// Grok 12.0.1 - math-inl.h (source listing; see the documentation of this file).
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Include guard (still compiled once per target)
17#if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE) // NOLINT
19#ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
21#else
22#define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
23#endif
24
25#include <stddef.h>
26
27#include "hwy/highway.h"
28
30namespace hwy {
31namespace HWY_NAMESPACE {
32
41template <class D, class V>
42HWY_INLINE V Acos(D d, V x);
43template <class D, class V>
45 return Acos(d, x);
46}
47
56template <class D, class V>
57HWY_INLINE V Acosh(D d, V x);
58template <class D, class V>
60 return Acosh(d, x);
61}
62
71template <class D, class V>
72HWY_INLINE V Asin(D d, V x);
73template <class D, class V>
75 return Asin(d, x);
76}
77
86template <class D, class V>
87HWY_INLINE V Asinh(D d, V x);
88template <class D, class V>
90 return Asinh(d, x);
91}
92
101template <class D, class V>
102HWY_INLINE V Atan(D d, V x);
103template <class D, class V>
105 return Atan(d, x);
106}
107
116template <class D, class V>
117HWY_INLINE V Atanh(D d, V x);
118template <class D, class V>
120 return Atanh(d, x);
121}
122
123// Atan2 was added later and some users may be implementing it themselves, so
124// notify them that this version of Highway defines it already.
125#ifndef HWY_HAVE_ATAN2
126#define HWY_HAVE_ATAN2 1
127#endif
128
136template <class D, class V = VFromD<D>, class M = MFromD<D>,
137 typename T = TFromD<D>>
138HWY_INLINE V Atan2(const D d, V y, V x) {
139 const V kHalf = Set(d, static_cast<T>(+0.5));
140 const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
141 const V kPi2 = Mul(kPi, kHalf);
142
143 const V k0 = Zero(d);
144 const M y_0 = Eq(y, k0);
145 const M x_0 = Eq(x, k0);
146 const M x_neg = Lt(x, k0);
147 const M y_inf = IsInf(y);
148 const M x_inf = IsInf(x);
149 const M nan = Or(IsNaN(y), IsNaN(x));
150
151 const V if_xneg_pi = IfThenElseZero(x_neg, kPi);
152 // x= +inf: pi/4; -inf: 3*pi/4; else: pi/2
153 const V if_yinf = Mul(kHalf, IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi));
154
155 V t = Atan(d, Div(y, x));
156 // Disambiguate between quadrants 1/3 and 2/4 by adding (Q2: Pi; Q3: -Pi).
157 t = Add(t, CopySignToAbs(if_xneg_pi, y));
158 // Special cases for 0 and infinity:
159 t = IfThenElse(x_inf, if_xneg_pi, t);
160 t = IfThenElse(x_0, kPi2, t);
161 t = IfThenElse(y_inf, if_yinf, t);
162 t = IfThenElse(y_0, if_xneg_pi, t);
163 // Any input NaN => NaN, otherwise fix sign.
164 return IfThenElse(nan, NaN(d), CopySign(t, y));
165}
166template <class D, class V>
168 return Atan2(d, y, x);
169}
170
179template <class D, class V>
180HWY_INLINE V Cos(D d, V x);
181template <class D, class V>
183 return Cos(d, x);
184}
185
194template <class D, class V>
195HWY_INLINE V Exp(D d, V x);
196template <class D, class V>
198 return Exp(d, x);
199}
200
209template <class D, class V>
210HWY_INLINE V Expm1(D d, V x);
211template <class D, class V>
213 return Expm1(d, x);
214}
215
224template <class D, class V>
225HWY_INLINE V Log(D d, V x);
226template <class D, class V>
228 return Log(d, x);
229}
230
239template <class D, class V>
240HWY_INLINE V Log10(D d, V x);
241template <class D, class V>
243 return Log10(d, x);
244}
245
254template <class D, class V>
255HWY_INLINE V Log1p(D d, V x);
256template <class D, class V>
258 return Log1p(d, x);
259}
260
269template <class D, class V>
270HWY_INLINE V Log2(D d, V x);
271template <class D, class V>
273 return Log2(d, x);
274}
275
284template <class D, class V>
285HWY_INLINE V Sin(D d, V x);
286template <class D, class V>
288 return Sin(d, x);
289}
290
299template <class D, class V>
300HWY_INLINE V Sinh(D d, V x);
301template <class D, class V>
303 return Sinh(d, x);
304}
305
314template <class D, class V>
315HWY_INLINE V Tanh(D d, V x);
316template <class D, class V>
318 return Tanh(d, x);
319}
320
331template <class D, class V>
332HWY_INLINE void SinCos(D d, V x, V& s, V& c);
333template <class D, class V>
335 SinCos(d, x, s, c);
336}
337
339// Implementation
341namespace impl {
342
343// Estrin's Scheme is a faster method for evaluating large polynomials on
344// super scalar architectures. It works by factoring the Horner's Method
345// polynomial into power of two sub-trees that can be evaluated in parallel.
346// Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
347template <class T>
349 return MulAdd(c1, x, c0);
350}
351template <class T>
352HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
353 T x2 = Mul(x, x);
354 return MulAdd(x2, c2, MulAdd(c1, x, c0));
355}
356template <class T>
357HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
358 T x2 = Mul(x, x);
359 return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
360}
361template <class T>
362HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
363 T x2 = Mul(x, x);
364 T x4 = Mul(x2, x2);
365 return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
366}
367template <class T>
368HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
369 T x2 = Mul(x, x);
370 T x4 = Mul(x2, x2);
371 return MulAdd(x4, MulAdd(c5, x, c4),
372 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
373}
374template <class T>
375HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
376 T c6) {
377 T x2 = Mul(x, x);
378 T x4 = Mul(x2, x2);
379 return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
380 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
381}
382template <class T>
383HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
384 T c6, T c7) {
385 T x2 = Mul(x, x);
386 T x4 = Mul(x2, x2);
387 return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
388 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
389}
390template <class T>
391HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
392 T c6, T c7, T c8) {
393 T x2 = Mul(x, x);
394 T x4 = Mul(x2, x2);
395 T x8 = Mul(x4, x4);
396 return MulAdd(x8, c8,
397 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
398 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
399}
400template <class T>
401HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
402 T c6, T c7, T c8, T c9) {
403 T x2 = Mul(x, x);
404 T x4 = Mul(x2, x2);
405 T x8 = Mul(x4, x4);
406 return MulAdd(x8, MulAdd(c9, x, c8),
407 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
408 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
409}
410template <class T>
411HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
412 T c6, T c7, T c8, T c9, T c10) {
413 T x2 = Mul(x, x);
414 T x4 = Mul(x2, x2);
415 T x8 = Mul(x4, x4);
416 return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
417 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
418 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
419}
420template <class T>
421HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
422 T c6, T c7, T c8, T c9, T c10, T c11) {
423 T x2 = Mul(x, x);
424 T x4 = Mul(x2, x2);
425 T x8 = Mul(x4, x4);
426 return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
427 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
428 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
429}
430template <class T>
431HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
432 T c6, T c7, T c8, T c9, T c10, T c11,
433 T c12) {
434 T x2 = Mul(x, x);
435 T x4 = Mul(x2, x2);
436 T x8 = Mul(x4, x4);
437 return MulAdd(
438 x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
439 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
440 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
441}
442template <class T>
443HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
444 T c6, T c7, T c8, T c9, T c10, T c11,
445 T c12, T c13) {
446 T x2 = Mul(x, x);
447 T x4 = Mul(x2, x2);
448 T x8 = Mul(x4, x4);
449 return MulAdd(x8,
450 MulAdd(x4, MulAdd(c13, x, c12),
451 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
452 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
453 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
454}
455template <class T>
456HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
457 T c6, T c7, T c8, T c9, T c10, T c11,
458 T c12, T c13, T c14) {
459 T x2 = Mul(x, x);
460 T x4 = Mul(x2, x2);
461 T x8 = Mul(x4, x4);
462 return MulAdd(x8,
463 MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
464 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
465 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
466 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
467}
468template <class T>
469HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
470 T c6, T c7, T c8, T c9, T c10, T c11,
471 T c12, T c13, T c14, T c15) {
472 T x2 = Mul(x, x);
473 T x4 = Mul(x2, x2);
474 T x8 = Mul(x4, x4);
475 return MulAdd(x8,
476 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
477 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
478 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
479 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
480}
481template <class T>
482HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
483 T c6, T c7, T c8, T c9, T c10, T c11,
484 T c12, T c13, T c14, T c15, T c16) {
485 T x2 = Mul(x, x);
486 T x4 = Mul(x2, x2);
487 T x8 = Mul(x4, x4);
488 T x16 = Mul(x8, x8);
489 return MulAdd(
490 x16, c16,
491 MulAdd(x8,
492 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
493 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
494 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
495 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
496}
497template <class T>
498HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
499 T c6, T c7, T c8, T c9, T c10, T c11,
500 T c12, T c13, T c14, T c15, T c16, T c17) {
501 T x2 = Mul(x, x);
502 T x4 = Mul(x2, x2);
503 T x8 = Mul(x4, x4);
504 T x16 = Mul(x8, x8);
505 return MulAdd(
506 x16, MulAdd(c17, x, c16),
507 MulAdd(x8,
508 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
509 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
510 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
511 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
512}
513template <class T>
514HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
515 T c6, T c7, T c8, T c9, T c10, T c11,
516 T c12, T c13, T c14, T c15, T c16, T c17,
517 T c18) {
518 T x2 = Mul(x, x);
519 T x4 = Mul(x2, x2);
520 T x8 = Mul(x4, x4);
521 T x16 = Mul(x8, x8);
522 return MulAdd(
523 x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
524 MulAdd(x8,
525 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
526 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
527 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
528 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
529}
530
// Primary templates for the per-element-type helper structs. They are left
// empty on purpose: the members live in explicit specializations keyed on the
// FloatOrDouble element type.
template <class FloatOrDouble> struct AsinImpl {};
template <class FloatOrDouble> struct AtanImpl {};
template <class FloatOrDouble> struct CosSinImpl {};
template <class FloatOrDouble> struct ExpImpl {};
template <class FloatOrDouble> struct LogImpl {};
template <class FloatOrDouble> struct SinCosImpl {};
543
544template <>
545struct AsinImpl<float> {
546 // Polynomial approximation for asin(x) over the range [0, 0.5).
547 template <class D, class V>
548 HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
549 const auto k0 = Set(d, +0.1666677296f);
550 const auto k1 = Set(d, +0.07495029271f);
551 const auto k2 = Set(d, +0.04547423869f);
552 const auto k3 = Set(d, +0.02424046025f);
553 const auto k4 = Set(d, +0.04197454825f);
554
555 return Estrin(x2, k0, k1, k2, k3, k4);
556 }
557};
558
559#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
560
561template <>
562struct AsinImpl<double> {
563 // Polynomial approximation for asin(x) over the range [0, 0.5).
564 template <class D, class V>
565 HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
566 const auto k0 = Set(d, +0.1666666666666497543);
567 const auto k1 = Set(d, +0.07500000000378581611);
568 const auto k2 = Set(d, +0.04464285681377102438);
569 const auto k3 = Set(d, +0.03038195928038132237);
570 const auto k4 = Set(d, +0.02237176181932048341);
571 const auto k5 = Set(d, +0.01735956991223614604);
572 const auto k6 = Set(d, +0.01388715184501609218);
573 const auto k7 = Set(d, +0.01215360525577377331);
574 const auto k8 = Set(d, +0.006606077476277170610);
575 const auto k9 = Set(d, +0.01929045477267910674);
576 const auto k10 = Set(d, -0.01581918243329996643);
577 const auto k11 = Set(d, +0.03161587650653934628);
578
579 return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
580 }
581};
582
583#endif
584
585template <>
586struct AtanImpl<float> {
587 // Polynomial approximation for atan(x) over the range [0, 1.0).
588 template <class D, class V>
590 const auto k0 = Set(d, -0.333331018686294555664062f);
591 const auto k1 = Set(d, +0.199926957488059997558594f);
592 const auto k2 = Set(d, -0.142027363181114196777344f);
593 const auto k3 = Set(d, +0.106347933411598205566406f);
594 const auto k4 = Set(d, -0.0748900920152664184570312f);
595 const auto k5 = Set(d, +0.0425049886107444763183594f);
596 const auto k6 = Set(d, -0.0159569028764963150024414f);
597 const auto k7 = Set(d, +0.00282363896258175373077393f);
598
599 const auto y = Mul(x, x);
600 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
601 }
602};
603
604#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
605
606template <>
607struct AtanImpl<double> {
608 // Polynomial approximation for atan(x) over the range [0, 1.0).
609 template <class D, class V>
610 HWY_INLINE V AtanPoly(D d, V x) {
611 const auto k0 = Set(d, -0.333333333333311110369124);
612 const auto k1 = Set(d, +0.199999999996591265594148);
613 const auto k2 = Set(d, -0.14285714266771329383765);
614 const auto k3 = Set(d, +0.111111105648261418443745);
615 const auto k4 = Set(d, -0.090908995008245008229153);
616 const auto k5 = Set(d, +0.0769219538311769618355029);
617 const auto k6 = Set(d, -0.0666573579361080525984562);
618 const auto k7 = Set(d, +0.0587666392926673580854313);
619 const auto k8 = Set(d, -0.0523674852303482457616113);
620 const auto k9 = Set(d, +0.0466667150077840625632675);
621 const auto k10 = Set(d, -0.0407629191276836500001934);
622 const auto k11 = Set(d, +0.0337852580001353069993897);
623 const auto k12 = Set(d, -0.0254517624932312641616861);
624 const auto k13 = Set(d, +0.016599329773529201970117);
625 const auto k14 = Set(d, -0.00889896195887655491740809);
626 const auto k15 = Set(d, +0.00370026744188713119232403);
627 const auto k16 = Set(d, -0.00110611831486672482563471);
628 const auto k17 = Set(d, +0.000209850076645816976906797);
629 const auto k18 = Set(d, -1.88796008463073496563746e-5);
630
631 const auto y = Mul(x, x);
632 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
633 k12, k13, k14, k15, k16, k17, k18),
634 Mul(y, x), x);
635 }
636};
637
638#endif
639
640template <>
641struct CosSinImpl<float> {
642 // Rounds float toward zero and returns as int32_t.
643 template <class D, class V>
645 return ConvertTo(Rebind<int32_t, D>(), x);
646 }
647
648 template <class D, class V>
649 HWY_INLINE V Poly(D d, V x) {
650 const auto k0 = Set(d, -1.66666597127914428710938e-1f);
651 const auto k1 = Set(d, +8.33307858556509017944336e-3f);
652 const auto k2 = Set(d, -1.981069071916863322258e-4f);
653 const auto k3 = Set(d, +2.6083159809786593541503e-6f);
654
655 const auto y = Mul(x, x);
656 return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
657 }
658
659 template <class D, class V, class VI32>
660 HWY_INLINE V CosReduce(D d, V x, VI32 q) {
661 // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
662 const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
663 const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
664 const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
665 const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
666
667 // Extended precision modular arithmetic.
668 const V qf = ConvertTo(d, q);
669 x = MulAdd(qf, kHalfPiPart0f, x);
670 x = MulAdd(qf, kHalfPiPart1f, x);
671 x = MulAdd(qf, kHalfPiPart2f, x);
672 x = MulAdd(qf, kHalfPiPart3f, x);
673 return x;
674 }
675
676 template <class D, class V, class VI32>
677 HWY_INLINE V SinReduce(D d, V x, VI32 q) {
678 // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
679 const V kPiPart0f = Set(d, -3.140625f);
680 const V kPiPart1f = Set(d, -0.0009670257568359375f);
681 const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
682 const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
683
684 // Extended precision modular arithmetic.
685 const V qf = ConvertTo(d, q);
686 x = MulAdd(qf, kPiPart0f, x);
687 x = MulAdd(qf, kPiPart1f, x);
688 x = MulAdd(qf, kPiPart2f, x);
689 x = MulAdd(qf, kPiPart3f, x);
690 return x;
691 }
692
693 // (q & 2) == 0 ? -0.0 : +0.0
694 template <class D, class VI32>
696 const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
697 return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
698 }
699
700 // ((q & 1) ? -0.0 : +0.0)
701 template <class D, class VI32>
703 const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
704 return BitCast(d, ShiftLeft<31>(And(q, kOne)));
705 }
706};
707
708#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
709
710template <>
711struct CosSinImpl<double> {
712 // Rounds double toward zero and returns as int32_t.
713 template <class D, class V>
714 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
715 return DemoteTo(Rebind<int32_t, D>(), x);
716 }
717
718 template <class D, class V>
719 HWY_INLINE V Poly(D d, V x) {
720 const auto k0 = Set(d, -0.166666666666666657414808);
721 const auto k1 = Set(d, +0.00833333333333332974823815);
722 const auto k2 = Set(d, -0.000198412698412696162806809);
723 const auto k3 = Set(d, +2.75573192239198747630416e-6);
724 const auto k4 = Set(d, -2.50521083763502045810755e-8);
725 const auto k5 = Set(d, +1.60590430605664501629054e-10);
726 const auto k6 = Set(d, -7.64712219118158833288484e-13);
727 const auto k7 = Set(d, +2.81009972710863200091251e-15);
728 const auto k8 = Set(d, -7.97255955009037868891952e-18);
729
730 const auto y = Mul(x, x);
731 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
732 }
733
734 template <class D, class V, class VI32>
735 HWY_INLINE V CosReduce(D d, V x, VI32 q) {
736 // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
737 const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
738 const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
739 const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
740 const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
741
742 // Extended precision modular arithmetic.
743 const V qf = PromoteTo(d, q);
744 x = MulAdd(qf, kHalfPiPart0d, x);
745 x = MulAdd(qf, kHalfPiPart1d, x);
746 x = MulAdd(qf, kHalfPiPart2d, x);
747 x = MulAdd(qf, kHalfPiPart3d, x);
748 return x;
749 }
750
751 template <class D, class V, class VI32>
752 HWY_INLINE V SinReduce(D d, V x, VI32 q) {
753 // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
754 const V kPiPart0d = Set(d, -3.1415926218032836914);
755 const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
756 const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
757 const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
758
759 // Extended precision modular arithmetic.
760 const V qf = PromoteTo(d, q);
761 x = MulAdd(qf, kPiPart0d, x);
762 x = MulAdd(qf, kPiPart1d, x);
763 x = MulAdd(qf, kPiPart2d, x);
764 x = MulAdd(qf, kPiPart3d, x);
765 return x;
766 }
767
768 // (q & 2) == 0 ? -0.0 : +0.0
769 template <class D, class VI32>
770 HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
771 const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
772 return BitCast(
773 d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
774 }
775
776 // ((q & 1) ? -0.0 : +0.0)
777 template <class D, class VI32>
778 HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
779 const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
780 return BitCast(
781 d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
782 }
783};
784
785#endif
786
787template <>
788struct ExpImpl<float> {
789 // Rounds float toward zero and returns as int32_t.
790 template <class D, class V>
792 return ConvertTo(Rebind<int32_t, D>(), x);
793 }
794
795 template <class D, class V>
796 HWY_INLINE V ExpPoly(D d, V x) {
797 const auto k0 = Set(d, +0.5f);
798 const auto k1 = Set(d, +0.166666671633720397949219f);
799 const auto k2 = Set(d, +0.0416664853692054748535156f);
800 const auto k3 = Set(d, +0.00833336077630519866943359f);
801 const auto k4 = Set(d, +0.00139304355252534151077271f);
802 const auto k5 = Set(d, +0.000198527617612853646278381f);
803
804 return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
805 }
806
807 // Computes 2^x, where x is an integer.
808 template <class D, class VI32>
810 const Rebind<int32_t, D> di32;
811 const VI32 kOffset = Set(di32, 0x7F);
812 return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
813 }
814
  // Multiplies 'x' by 2^e, applied as two successive power-of-two factors.
816 template <class D, class V, class VI32>
817 HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
818 const VI32 y = ShiftRight<1>(e);
819 return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
820 }
821
822 template <class D, class V, class VI32>
823 HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
824 // kLn2Part0f + kLn2Part1f ~= -ln(2)
825 const V kLn2Part0f = Set(d, -0.693145751953125f);
826 const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
827
828 // Extended precision modular arithmetic.
829 const V qf = ConvertTo(d, q);
830 x = MulAdd(qf, kLn2Part0f, x);
831 x = MulAdd(qf, kLn2Part1f, x);
832 return x;
833 }
834};
835
836template <>
837struct LogImpl<float> {
838 template <class D, class V>
840 const Rebind<int32_t, D> di32;
841 const Rebind<uint32_t, D> du32;
842 const auto kBias = Set(di32, 0x7F);
843 return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
844 }
845
846 // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
847 template <class D, class V>
848 HWY_INLINE V LogPoly(D d, V x) {
849 const V k0 = Set(d, 0.66666662693f);
850 const V k1 = Set(d, 0.40000972152f);
851 const V k2 = Set(d, 0.28498786688f);
852 const V k3 = Set(d, 0.24279078841f);
853
854 const V x2 = Mul(x, x);
855 const V x4 = Mul(x2, x2);
856 return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
857 }
858};
859
860#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
861template <>
862struct ExpImpl<double> {
863 // Rounds double toward zero and returns as int32_t.
864 template <class D, class V>
865 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
866 return DemoteTo(Rebind<int32_t, D>(), x);
867 }
868
869 template <class D, class V>
870 HWY_INLINE V ExpPoly(D d, V x) {
871 const auto k0 = Set(d, +0.5);
872 const auto k1 = Set(d, +0.166666666666666851703837);
873 const auto k2 = Set(d, +0.0416666666666665047591422);
874 const auto k3 = Set(d, +0.00833333333331652721664984);
875 const auto k4 = Set(d, +0.00138888888889774492207962);
876 const auto k5 = Set(d, +0.000198412698960509205564975);
877 const auto k6 = Set(d, +2.4801587159235472998791e-5);
878 const auto k7 = Set(d, +2.75572362911928827629423e-6);
879 const auto k8 = Set(d, +2.75573911234900471893338e-7);
880 const auto k9 = Set(d, +2.51112930892876518610661e-8);
881 const auto k10 = Set(d, +2.08860621107283687536341e-9);
882
883 return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
884 Mul(x, x), x);
885 }
886
887 // Computes 2^x, where x is an integer.
888 template <class D, class VI32>
889 HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
890 const Rebind<int32_t, D> di32;
891 const Rebind<int64_t, D> di64;
892 const VI32 kOffset = Set(di32, 0x3FF);
893 return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
894 }
895
896 // Sets the exponent of 'x' to 2^e.
897 template <class D, class V, class VI32>
898 HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
899 const VI32 y = ShiftRight<1>(e);
900 return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
901 }
902
903 template <class D, class V, class VI32>
904 HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
905 // kLn2Part0d + kLn2Part1d ~= -ln(2)
906 const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
907 const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
908
909 // Extended precision modular arithmetic.
910 const V qf = PromoteTo(d, q);
911 x = MulAdd(qf, kLn2Part0d, x);
912 x = MulAdd(qf, kLn2Part1d, x);
913 return x;
914 }
915};
916
917template <>
918struct LogImpl<double> {
919 template <class D, class V>
920 HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
921 const Rebind<int64_t, D> di64;
922 const Rebind<uint64_t, D> du64;
923 return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
924 Set(di64, 0x3FF));
925 }
926
927 // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
928 template <class D, class V>
929 HWY_INLINE V LogPoly(D d, V x) {
930 const V k0 = Set(d, 0.6666666666666735130);
931 const V k1 = Set(d, 0.3999999999940941908);
932 const V k2 = Set(d, 0.2857142874366239149);
933 const V k3 = Set(d, 0.2222219843214978396);
934 const V k4 = Set(d, 0.1818357216161805012);
935 const V k5 = Set(d, 0.1531383769920937332);
936 const V k6 = Set(d, 0.1479819860511658591);
937
938 const V x2 = Mul(x, x);
939 const V x4 = Mul(x2, x2);
940 return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
941 (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
942 }
943};
944
945#endif
946
947template <class D, class V, bool kAllowSubnormals = true>
948HWY_INLINE V Log(const D d, V x) {
949 // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
950 using T = TFromD<D>;
951 impl::LogImpl<T> impl;
952
953 constexpr bool kIsF32 = (sizeof(T) == 4);
954
955 // Float Constants
956 const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
957 : static_cast<T>(0.693147180369123816490));
958 const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
959 : static_cast<T>(1.90821492927058770002e-10));
960 const V kOne = Set(d, static_cast<T>(+1.0));
961 const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
962 : static_cast<T>(2.2250738585072014e-308));
963 const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
964 : static_cast<T>(1.8014398509481984e+16));
965
966 // Integer Constants
967 using TI = MakeSigned<T>;
968 const Rebind<TI, D> di;
969 using VI = decltype(Zero(di));
970 const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
971 : static_cast<TI>(0xFFFFFFFFLL));
972 const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
973 : static_cast<TI>(0x3FE6A09E00000000LL));
974 const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
975 : static_cast<TI>(0x3FF0000000000000LL));
976 const VI kExpScale =
977 Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
978 const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
979 : static_cast<TI>(0xFFFFF00000000LL));
980
981 // Scale up 'x' so that it is no longer denormalized.
982 VI exp_bits;
983 V exp;
984 if (kAllowSubnormals == true) {
985 const auto is_denormal = Lt(x, kMinNormal);
986 x = IfThenElse(is_denormal, Mul(x, kScale), x);
987
988 // Compute the new exponent.
989 exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
990 const VI exp_scale =
991 BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
992 exp = ConvertTo(
993 d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
994 } else {
995 // Compute the new exponent.
996 exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
997 exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
998 }
999
1000 // Renormalize.
1001 const V y = Or(And(x, BitCast(d, kLowerBits)),
1002 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
1003
1004 // Approximate and reconstruct.
1005 const V ym1 = Sub(y, kOne);
1006 const V z = Div(ym1, Add(y, kOne));
1007
1008 return MulSub(
1009 exp, kLn2Hi,
1010 Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
1011}
1012
1013// SinCos
1014// Based on "sse_mathfun.h", by Julien Pommier
1015// http://gruntthepeon.free.fr/ssemath/
1016
1017// Third degree poly
1018template <class D, class V>
1020 V& s, V& c) {
1021 using T = TFromD<D>;
1022 using TI = MakeSigned<T>;
1023 using DI = Rebind<TI, D>;
1024 const DI di;
1025 using VI = decltype(Zero(di));
1026 using M = Mask<D>;
1027
1028 static constexpr size_t bits = sizeof(TI) * 8;
1029 const VI sign_mask = SignBit(di);
1030 const VI ci_0 = Zero(di);
1031 const VI ci_1 = Set(di, 1);
1032 const VI ci_2 = Set(di, 2);
1033 const VI ci_4 = Set(di, 4);
1034 const V cos_p0 = Set(d, ConvertScalarTo<T>(2.443315711809948E-005));
1035 const V cos_p1 = Set(d, ConvertScalarTo<T>(-1.388731625493765E-003));
1036 const V cos_p2 = Set(d, ConvertScalarTo<T>(4.166664568298827E-002));
1037 const V sin_p0 = Set(d, ConvertScalarTo<T>(-1.9515295891E-4));
1038 const V sin_p1 = Set(d, ConvertScalarTo<T>(8.3321608736E-3));
1039 const V sin_p2 = Set(d, ConvertScalarTo<T>(-1.6666654611E-1));
1040 const V FOPI = Set(d, ConvertScalarTo<T>(1.27323954473516)); // 4 / M_PI
1041 const V DP1 = Set(d, dp1);
1042 const V DP2 = Set(d, dp2);
1043 const V DP3 = Set(d, dp3);
1044
1045 V xmm1, xmm2, sign_bit_sin, y;
1046 VI imm0, imm2, imm4;
1047
1048 sign_bit_sin = x;
1049 x = Abs(x);
1050
1051 /* extract the sign bit (upper one) */
1052 sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask));
1053
1054 /* scale by 4/Pi */
1055 y = Mul(x, FOPI);
1056
1057 /* store the integer part of y in imm2 */
1058 imm2 = ConvertTo(di, y);
1059
1060 /* j=(j+1) & (~1) (see the cephes sources) */
1061 imm2 = Add(imm2, ci_1);
1062 imm2 = AndNot(ci_1, imm2);
1063
1064 y = ConvertTo(d, imm2);
1065 imm4 = imm2;
1066
1067 /* get the swap sign flag for the sine */
1068 imm0 = And(imm2, ci_4);
1069 imm0 = ShiftLeft<bits - 3>(imm0);
1070
1071 V swap_sign_bit_sin = BitCast(d, imm0);
1072
1073 /* get the polynomial selection mask for the sine*/
1074 imm2 = And(imm2, ci_2);
1075 M poly_mask = RebindMask(d, Eq(imm2, ci_0));
1076
1077 /* The magic pass: "Extended precision modular arithmetic"
1078 x = ((x - y * DP1) - y * DP2) - y * DP3; */
1079 x = MulAdd(y, DP1, x);
1080 x = MulAdd(y, DP2, x);
1081 x = MulAdd(y, DP3, x);
1082
1083 imm4 = Sub(imm4, ci_2);
1084 imm4 = AndNot(imm4, ci_4);
1085 imm4 = ShiftLeft<bits - 3>(imm4);
1086
1087 V sign_bit_cos = BitCast(d, imm4);
1088
1089 sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin);
1090
1091 /* Evaluate the first polynomial (0 <= x <= Pi/4) */
1092 V z = Mul(x, x);
1093
1094 y = MulAdd(cos_p0, z, cos_p1);
1095 y = MulAdd(y, z, cos_p2);
1096 y = Mul(y, z);
1097 y = Mul(y, z);
1098 y = NegMulAdd(z, Set(d, 0.5f), y);
1099 y = Add(y, Set(d, 1));
1100
1101 /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
1102 V y2 = MulAdd(sin_p0, z, sin_p1);
1103 y2 = MulAdd(y2, z, sin_p2);
1104 y2 = Mul(y2, z);
1105 y2 = MulAdd(y2, x, x);
1106
1107 /* select the correct result from the two polynomials */
1108 xmm1 = IfThenElse(poly_mask, y2, y);
1109 xmm2 = IfThenElse(poly_mask, y, y2);
1110
1111 /* update the sign */
1112 s = Xor(xmm1, sign_bit_sin);
1113 c = Xor(xmm2, sign_bit_cos);
1114}
1115
1116// Sixth degree poly
1117template <class D, class V>
1119 V& s, V& c) {
1120 using T = TFromD<D>;
1121 using TI = MakeSigned<T>;
1122 using DI = Rebind<TI, D>;
1123 const DI di;
1124 using VI = decltype(Zero(di));
1125 using M = Mask<D>;
1126
1127 static constexpr size_t bits = sizeof(TI) * 8;
1128 const VI sign_mask = SignBit(di);
1129 const VI ci_0 = Zero(di);
1130 const VI ci_1 = Set(di, 1);
1131 const VI ci_2 = Set(di, 2);
1132 const VI ci_4 = Set(di, 4);
1133 const V cos_p0 = Set(d, ConvertScalarTo<T>(-1.13585365213876817300E-11));
1134 const V cos_p1 = Set(d, ConvertScalarTo<T>(2.08757008419747316778E-9));
1135 const V cos_p2 = Set(d, ConvertScalarTo<T>(-2.75573141792967388112E-7));
1136 const V cos_p3 = Set(d, ConvertScalarTo<T>(2.48015872888517045348E-5));
1137 const V cos_p4 = Set(d, ConvertScalarTo<T>(-1.38888888888730564116E-3));
1138 const V cos_p5 = Set(d, ConvertScalarTo<T>(4.16666666666665929218E-2));
1139 const V sin_p0 = Set(d, ConvertScalarTo<T>(1.58962301576546568060E-10));
1140 const V sin_p1 = Set(d, ConvertScalarTo<T>(-2.50507477628578072866E-8));
1141 const V sin_p2 = Set(d, ConvertScalarTo<T>(2.75573136213857245213E-6));
1142 const V sin_p3 = Set(d, ConvertScalarTo<T>(-1.98412698295895385996E-4));
1143 const V sin_p4 = Set(d, ConvertScalarTo<T>(8.33333333332211858878E-3));
1144 const V sin_p5 = Set(d, ConvertScalarTo<T>(-1.66666666666666307295E-1));
1145 const V FOPI = // 4 / M_PI
1146 Set(d, ConvertScalarTo<T>(1.2732395447351626861510701069801148));
1147 const V DP1 = Set(d, dp1);
1148 const V DP2 = Set(d, dp2);
1149 const V DP3 = Set(d, dp3);
1150
1151 V xmm1, xmm2, sign_bit_sin, y;
1152 VI imm0, imm2, imm4;
1153
1154 sign_bit_sin = x;
1155 x = Abs(x);
1156
1157 /* extract the sign bit (upper one) */
1158 sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask));
1159
1160 /* scale by 4/Pi */
1161 y = Mul(x, FOPI);
1162
1163 /* store the integer part of y in imm2 */
1164 imm2 = ConvertTo(di, y);
1165
1166 /* j=(j+1) & (~1) (see the cephes sources) */
1167 imm2 = Add(imm2, ci_1);
1168 imm2 = AndNot(ci_1, imm2);
1169
1170 y = ConvertTo(d, imm2);
1171 imm4 = imm2;
1172
1173 /* get the swap sign flag for the sine */
1174 imm0 = And(imm2, ci_4);
1175 imm0 = ShiftLeft<bits - 3>(imm0);
1176
1177 V swap_sign_bit_sin = BitCast(d, imm0);
1178
1179 /* get the polynomial selection mask for the sine*/
1180 imm2 = And(imm2, ci_2);
1181 M poly_mask = RebindMask(d, Eq(imm2, ci_0));
1182
1183 /* The magic pass: "Extended precision modular arithmetic"
1184 x = ((x - y * DP1) - y * DP2) - y * DP3; */
1185 x = MulAdd(y, DP1, x);
1186 x = MulAdd(y, DP2, x);
1187 x = MulAdd(y, DP3, x);
1188
1189 imm4 = Sub(imm4, ci_2);
1190 imm4 = AndNot(imm4, ci_4);
1191 imm4 = ShiftLeft<bits - 3>(imm4);
1192
1193 V sign_bit_cos = BitCast(d, imm4);
1194 sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin);
1195
1196 /* Evaluate the first polynomial (0 <= x <= Pi/4) */
1197 V z = Mul(x, x);
1198
1199 y = MulAdd(cos_p0, z, cos_p1);
1200 y = MulAdd(y, z, cos_p2);
1201 y = MulAdd(y, z, cos_p3);
1202 y = MulAdd(y, z, cos_p4);
1203 y = MulAdd(y, z, cos_p5);
1204 y = Mul(y, z);
1205 y = Mul(y, z);
1206 y = NegMulAdd(z, Set(d, 0.5f), y);
1207 y = Add(y, Set(d, 1.0f));
1208
1209 /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
1210 V y2 = MulAdd(sin_p0, z, sin_p1);
1211 y2 = MulAdd(y2, z, sin_p2);
1212 y2 = MulAdd(y2, z, sin_p3);
1213 y2 = MulAdd(y2, z, sin_p4);
1214 y2 = MulAdd(y2, z, sin_p5);
1215 y2 = Mul(y2, z);
1216 y2 = MulAdd(y2, x, x);
1217
1218 /* select the correct result from the two polynomials */
1219 xmm1 = IfThenElse(poly_mask, y2, y);
1220 xmm2 = IfThenElse(poly_mask, y, y2);
1221
1222 /* update the sign */
1223 s = Xor(xmm1, sign_bit_sin);
1224 c = Xor(xmm2, sign_bit_cos);
1225}
1226
1227template <>
1228struct SinCosImpl<float> {
1229 template <class D, class V>
1230 HWY_INLINE void SinCos(D d, V x, V& s, V& c) {
1231 SinCos3(d, -0.78515625f, -2.4187564849853515625e-4f,
1232 -3.77489497744594108e-8f, x, s, c);
1233 }
1234};
1235
#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
// Double-precision specialization: forwards to SinCos6, supplying the three
// double constants it takes as dp1, dp2 and dp3. Only compiled when the
// target has both 64-bit float and 64-bit integer lanes.
template <>
struct SinCosImpl<double> {
  template <class D, class V>
  HWY_INLINE void SinCos(D d, V x, V& s, V& c) {
    constexpr double kDp1 = -7.85398125648498535156E-1;
    constexpr double kDp2 = -3.77489470793079817668E-8;
    constexpr double kDp3 = -2.69515142907905952645E-15;
    SinCos6(d, kDp1, kDp2, kDp3, x, s, c);
  }
};
#endif
1246
1247} // namespace impl
1248
1249template <class D, class V>
1250HWY_INLINE V Acos(const D d, V x) {
1251 using T = TFromD<D>;
1252
1253 const V kZero = Zero(d);
1254 const V kHalf = Set(d, static_cast<T>(+0.5));
1255 const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
1256 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1257
1258 const V sign_x = And(SignBit(d), x);
1259 const V abs_x = Xor(x, sign_x);
1260 const auto mask = Lt(abs_x, kHalf);
1261 const V yy =
1262 IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
1263 const V y = IfThenElse(mask, abs_x, Sqrt(yy));
1264
1265 impl::AsinImpl<T> impl;
1266 const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
1267
1268 const V t_plus_y = Add(t, y);
1269 const V z =
1270 IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
1271 Add(t_plus_y, t_plus_y));
1272 return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
1273}
1274
1275template <class D, class V>
1276HWY_INLINE V Acosh(const D d, V x) {
1277 using T = TFromD<D>;
1278
1279 const V kLarge = Set(d, static_cast<T>(268435456.0));
1280 const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
1281 const V kOne = Set(d, static_cast<T>(+1.0));
1282 const V kTwo = Set(d, static_cast<T>(+2.0));
1283
1284 const auto is_x_large = Gt(x, kLarge);
1285 const auto is_x_gt_2 = Gt(x, kTwo);
1286
1287 const V x_minus_1 = Sub(x, kOne);
1288 const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
1289 const V y1 =
1290 Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
1291 const V y2 =
1292 IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
1293 const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
1294
1295 const auto is_pole = Eq(y2, kOne);
1296 const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
1297 return Add(IfThenElse(is_x_gt_2, z,
1298 IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
1299 IfThenElseZero(is_x_large, kLog2));
1300}
1301
1302template <class D, class V>
1303HWY_INLINE V Asin(const D d, V x) {
1304 using T = TFromD<D>;
1305
1306 const V kHalf = Set(d, static_cast<T>(+0.5));
1307 const V kTwo = Set(d, static_cast<T>(+2.0));
1308 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1309
1310 const V sign_x = And(SignBit(d), x);
1311 const V abs_x = Xor(x, sign_x);
1312 const auto mask = Lt(abs_x, kHalf);
1313 const V yy =
1314 IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
1315 const V y = IfThenElse(mask, abs_x, Sqrt(yy));
1316
1317 impl::AsinImpl<T> impl;
1318 const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
1319 const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
1320 return Or(IfThenElse(mask, z0, z1), sign_x);
1321}
1322
1323template <class D, class V>
1324HWY_INLINE V Asinh(const D d, V x) {
1325 using T = TFromD<D>;
1326
1327 const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
1328 const V kLarge = Set(d, static_cast<T>(268435456.0));
1329 const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
1330 const V kOne = Set(d, static_cast<T>(+1.0));
1331 const V kTwo = Set(d, static_cast<T>(+2.0));
1332
1333 const V sign_x = And(SignBit(d), x); // Extract the sign bit
1334 const V abs_x = Xor(x, sign_x);
1335
1336 const auto is_x_large = Gt(abs_x, kLarge);
1337 const auto is_x_lt_2 = Lt(abs_x, kTwo);
1338
1339 const V x2 = Mul(x, x);
1340 const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
1341
1342 const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
1343 const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
1344 const V y2 =
1345 IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
1346 const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
1347
1348 const auto is_pole = Eq(y2, kOne);
1349 const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
1350 const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
1351 const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
1352 return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
1353 sign_x);
1354}
1355
1356template <class D, class V>
1357HWY_INLINE V Atan(const D d, V x) {
1358 using T = TFromD<D>;
1359
1360 const V kOne = Set(d, static_cast<T>(+1.0));
1361 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1362
1363 const V sign = And(SignBit(d), x);
1364 const V abs_x = Xor(x, sign);
1365 const auto mask = Gt(abs_x, kOne);
1366
1367 impl::AtanImpl<T> impl;
1368 const auto divisor = IfThenElse(mask, abs_x, kOne);
1369 const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
1370 return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
1371}
1372
1373template <class D, class V>
1374HWY_INLINE V Atanh(const D d, V x) {
1375 using T = TFromD<D>;
1376
1377 const V kHalf = Set(d, static_cast<T>(+0.5));
1378 const V kOne = Set(d, static_cast<T>(+1.0));
1379
1380 const V sign = And(SignBit(d), x); // Extract the sign bit
1381 const V abs_x = Xor(x, sign);
1382 return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
1383 Xor(kHalf, sign));
1384}
1385
1386template <class D, class V>
1387HWY_INLINE V Cos(const D d, V x) {
1388 using T = TFromD<D>;
1390
1391 // Float Constants
1392 const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
1393
1394 // Integer Constants
1395 const Rebind<int32_t, D> di32;
1396 using VI32 = decltype(Zero(di32));
1397 const VI32 kOne = Set(di32, 1);
1398
1399 const V y = Abs(x); // cos(x) == cos(|x|)
1400
1401 // Compute the quadrant, q = int(|x| / pi) * 2 + 1
1402 const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
1403
1404 // Reduce range, apply sign, and approximate.
1405 return impl.Poly(
1406 d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
1407}
1408
1409template <class D, class V>
1410HWY_INLINE V Exp(const D d, V x) {
1411 using T = TFromD<D>;
1412
1413 const V kHalf = Set(d, static_cast<T>(+0.5));
1414 const V kLowerBound =
1415 Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
1416 const V kNegZero = Set(d, static_cast<T>(-0.0));
1417 const V kOne = Set(d, static_cast<T>(+1.0));
1418 const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
1419
1420 impl::ExpImpl<T> impl;
1421
1422 // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1423 const auto q =
1424 impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1425
1426 // Reduce, approximate, and then reconstruct.
1427 const V y = impl.LoadExpShortRange(
1428 d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
1429 return IfThenElseZero(Ge(x, kLowerBound), y);
1430}
1431
1432template <class D, class V>
1433HWY_INLINE V Expm1(const D d, V x) {
1434 using T = TFromD<D>;
1435
1436 const V kHalf = Set(d, static_cast<T>(+0.5));
1437 const V kLowerBound =
1438 Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
1439 const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
1440 const V kNegOne = Set(d, static_cast<T>(-1.0));
1441 const V kNegZero = Set(d, static_cast<T>(-0.0));
1442 const V kOne = Set(d, static_cast<T>(+1.0));
1443 const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
1444
1445 impl::ExpImpl<T> impl;
1446
1447 // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1448 const auto q =
1449 impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1450
1451 // Reduce, approximate, and then reconstruct.
1452 const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
1453 const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
1454 Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
1455 return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
1456}
1457
1458template <class D, class V>
1459HWY_INLINE V Log(const D d, V x) {
1460 return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
1461}
1462
1463template <class D, class V>
1464HWY_INLINE V Log10(const D d, V x) {
1465 using T = TFromD<D>;
1466 return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
1467}
1468
1469template <class D, class V>
1470HWY_INLINE V Log1p(const D d, V x) {
1471 using T = TFromD<D>;
1472 const V kOne = Set(d, static_cast<T>(+1.0));
1473
1474 const V y = Add(x, kOne);
1475 const auto is_pole = Eq(y, kOne);
1476 const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
1477 const auto non_pole =
1478 Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
1479 return IfThenElse(is_pole, x, non_pole);
1480}
1481
1482template <class D, class V>
1483HWY_INLINE V Log2(const D d, V x) {
1484 using T = TFromD<D>;
1485 return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
1486}
1487
1488template <class D, class V>
1489HWY_INLINE V Sin(const D d, V x) {
1490 using T = TFromD<D>;
1492
1493 // Float Constants
1494 const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
1495 const V kHalf = Set(d, static_cast<T>(0.5));
1496
1497 // Integer Constants
1498 const Rebind<int32_t, D> di32;
1499 using VI32 = decltype(Zero(di32));
1500
1501 const V abs_x = Abs(x);
1502 const V sign_x = Xor(abs_x, x);
1503
1504 // Compute the quadrant, q = int((|x| / pi) + 0.5)
1505 const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
1506
1507 // Reduce range, apply sign, and approximate.
1508 return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
1509 Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
1510}
1511
1512template <class D, class V>
1513HWY_INLINE V Sinh(const D d, V x) {
1514 using T = TFromD<D>;
1515 const V kHalf = Set(d, static_cast<T>(+0.5));
1516 const V kOne = Set(d, static_cast<T>(+1.0));
1517 const V kTwo = Set(d, static_cast<T>(+2.0));
1518
1519 const V sign = And(SignBit(d), x); // Extract the sign bit
1520 const V abs_x = Xor(x, sign);
1521 const V y = Expm1(d, abs_x);
1522 const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
1523 return Xor(z, sign); // Reapply the sign bit
1524}
1525
1526template <class D, class V>
1527HWY_INLINE V Tanh(const D d, V x) {
1528 using T = TFromD<D>;
1529 const V kLimit = Set(d, static_cast<T>(18.714973875));
1530 const V kOne = Set(d, static_cast<T>(+1.0));
1531 const V kTwo = Set(d, static_cast<T>(+2.0));
1532
1533 const V sign = And(SignBit(d), x); // Extract the sign bit
1534 const V abs_x = Xor(x, sign);
1535 const V y = Expm1(d, Mul(abs_x, kTwo));
1536 const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
1537 return Xor(z, sign); // Reapply the sign bit
1538}
1539
1540template <class D, class V>
1541HWY_INLINE void SinCos(const D d, V x, V& s, V& c) {
1542 using T = TFromD<D>;
1544 impl.SinCos(d, x, s, c);
1545}
1546
1547// NOLINTNEXTLINE(google-readability-namespace-comments)
1548} // namespace HWY_NAMESPACE
1549} // namespace hwy
1551
1552#endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#define HWY_NOINLINE
Definition base.h:103
#define HWY_INLINE
Definition base.h:101
#define HWY_MAYBE_UNUSED
Definition base.h:113
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1)
Definition math-inl.h:348
HWY_INLINE V Log(const D d, V x)
Definition math-inl.h:948
HWY_INLINE void SinCos6(D d, TFromD< D > dp1, TFromD< D > dp2, TFromD< D > dp3, V x, V &s, V &c)
Definition math-inl.h:1118
HWY_INLINE void SinCos3(D d, TFromD< D > dp1, TFromD< D > dp2, TFromD< D > dp3, V x, V &s, V &c)
Definition math-inl.h:1019
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_NOINLINE V CallSin(const D d, VecArg< V > x)
Definition math-inl.h:287
HWY_NOINLINE V CallSinCos(const D d, VecArg< V > x, VecArg< V > &s, VecArg< V > &c)
Definition math-inl.h:334
HWY_NOINLINE V CallAsin(const D d, VecArg< V > x)
Definition math-inl.h:74
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API svbool_t IsInf(const V v)
Definition arm_sve-inl.h:1709
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_NOINLINE V CallAcos(const D d, VecArg< V > x)
Definition math-inl.h:44
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_NOINLINE V CallAtan2(const D d, VecArg< V > y, VecArg< V > x)
Definition math-inl.h:167
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
V VecArg
Definition ops/shared-inl.h:69
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_INLINE void SinCos(D d, V x, V &s, V &c)
Highway SIMD version of SinCos.
Definition math-inl.h:1541
HWY_API Vec< D > NaN(D d)
Definition generic_ops-inl.h:82
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_INLINE V Asin(D d, V x)
Highway SIMD version of std::asin(x).
Definition math-inl.h:1303
HWY_INLINE V Cos(D d, V x)
Highway SIMD version of std::cos(x).
Definition math-inl.h:1387
HWY_INLINE V Atan2(const D d, V y, V x)
Highway SIMD version of std::atan2(x).
Definition math-inl.h:138
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_NOINLINE V CallExpm1(const D d, VecArg< V > x)
Definition math-inl.h:212
HWY_INLINE V Acos(D d, V x)
Highway SIMD version of std::acos(x).
Definition math-inl.h:1250
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_INLINE V Log(D d, V x)
Highway SIMD version of std::log(x).
Definition math-inl.h:1459
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_NOINLINE V CallLog1p(const D d, VecArg< V > x)
Definition math-inl.h:257
HWY_INLINE V Acosh(D d, V x)
Highway SIMD version of std::acosh(x).
Definition math-inl.h:1276
HWY_NOINLINE V CallLog10(const D d, VecArg< V > x)
Definition math-inl.h:242
HWY_INLINE V Tanh(D d, V x)
Highway SIMD version of std::tanh(x).
Definition math-inl.h:1527
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_NOINLINE V CallLog2(const D d, VecArg< V > x)
Definition math-inl.h:272
HWY_NOINLINE V CallExp(const D d, VecArg< V > x)
Definition math-inl.h:197
HWY_NOINLINE V CallAtanh(const D d, VecArg< V > x)
Definition math-inl.h:119
HWY_INLINE V Exp(D d, V x)
Highway SIMD version of std::exp(x).
Definition math-inl.h:1410
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_NOINLINE V CallAtan(const D d, VecArg< V > x)
Definition math-inl.h:104
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:75
HWY_INLINE V Log10(D d, V x)
Highway SIMD version of std::log10(x).
Definition math-inl.h:1464
HWY_INLINE V Asinh(D d, V x)
Highway SIMD version of std::asinh(x).
Definition math-inl.h:1324
HWY_NOINLINE V CallLog(const D d, VecArg< V > x)
Definition math-inl.h:227
HWY_INLINE V Log2(D d, V x)
Highway SIMD version of std::log2(x).
Definition math-inl.h:1483
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_NOINLINE V CallAsinh(const D d, VecArg< V > x)
Definition math-inl.h:89
decltype(MaskFromVec(Zero(D()))) Mask
Definition generic_ops-inl.h:52
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_INLINE V Sin(D d, V x)
Highway SIMD version of std::sin(x).
Definition math-inl.h:1489
HWY_INLINE V Atanh(D d, V x)
Highway SIMD version of std::atanh(x).
Definition math-inl.h:1374
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_INLINE V Atan(D d, V x)
Highway SIMD version of std::atan(x).
Definition math-inl.h:1357
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_NOINLINE V CallCos(const D d, VecArg< V > x)
Definition math-inl.h:182
HWY_NOINLINE V CallSinh(const D d, VecArg< V > x)
Definition math-inl.h:302
HWY_INLINE V Expm1(D d, V x)
Highway SIMD version of std::expm1(x).
Definition math-inl.h:1433
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_NOINLINE V CallTanh(const D d, VecArg< V > x)
Definition math-inl.h:317
HWY_INLINE V Sinh(D d, V x)
Highway SIMD version of std::sinh(x).
Definition math-inl.h:1513
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_INLINE V Log1p(D d, V x)
Highway SIMD version of std::log1p(x).
Definition math-inl.h:1470
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_NOINLINE V CallAcosh(const D d, VecArg< V > x)
Definition math-inl.h:59
Definition abort.h:8
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:2080
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
HWY_INLINE V AsinPoly(D d, V x2, V)
Definition math-inl.h:548
Definition math-inl.h:532
HWY_INLINE V AtanPoly(D d, V x)
Definition math-inl.h:589
Definition math-inl.h:534
HWY_INLINE Vec< Rebind< float, D > > SinSignFromQuadrant(D d, VI32 q)
Definition math-inl.h:702
HWY_INLINE Vec< Rebind< float, D > > CosSignFromQuadrant(D d, VI32 q)
Definition math-inl.h:695
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition math-inl.h:644
HWY_INLINE V SinReduce(D d, V x, VI32 q)
Definition math-inl.h:677
HWY_INLINE V Poly(D d, V x)
Definition math-inl.h:649
HWY_INLINE V CosReduce(D d, V x, VI32 q)
Definition math-inl.h:660
Definition math-inl.h:536
HWY_INLINE Vec< D > Pow2I(D d, VI32 x)
Definition math-inl.h:809
HWY_INLINE V ExpReduce(D d, V x, VI32 q)
Definition math-inl.h:823
HWY_INLINE V ExpPoly(D d, V x)
Definition math-inl.h:796
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e)
Definition math-inl.h:817
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition math-inl.h:791
Definition math-inl.h:538
HWY_INLINE Vec< Rebind< int32_t, D > > Log2p1NoSubnormal(D, V x)
Definition math-inl.h:839
HWY_INLINE V LogPoly(D d, V x)
Definition math-inl.h:848
Definition math-inl.h:540
HWY_INLINE void SinCos(D d, V x, V &s, V &c)
Definition math-inl.h:1230
Definition math-inl.h:542