Argon 0.1.0
Loading...
Searching...
No Matches
a32.hpp
1#pragma once
2#include "vfpv4.hpp"
3
4#include "arm_simd/shared/a32_float.hpp"
5
6#ifdef __ARM_ACLE
7#include <arm_acle.h>
8#endif
9
10#ifdef __cplusplus
// `nce` is the specifier written in front of the wrappers declared below:
// it expands to `constexpr` when the compiler is clang and to `inline` otherwise.
#if defined(__clang__)
#  define nce constexpr
#else
#  define nce inline
#endif
16
17namespace neon {
18// clang-format off
19template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float32x2_t a);
20template <typename T> nce T convert_round_toward_negative_infinity(float32x2_t a);
21template <typename T> nce T convert_round_toward_positive_infinity(float32x2_t a);
22template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float32x2_t a);
23template <typename T> nce T reinterpret(int8x8_t a);
24template <typename T> nce T reinterpret(int16x4_t a);
25template <typename T> nce T reinterpret(int32x2_t a);
26template <typename T> nce T reinterpret(float32x2_t a);
27template <typename T> nce T reinterpret(uint8x8_t a);
28template <typename T> nce T reinterpret(uint16x4_t a);
29template <typename T> nce T reinterpret(uint32x2_t a);
30template <typename T> nce T reinterpret(poly8x8_t a);
31template <typename T> nce T reinterpret(poly16x4_t a);
32template <typename T> nce T reinterpret(uint64x1_t a);
33template <typename T> nce T reinterpret(poly64x1_t a);
34template <typename T> nce T reinterpret(float32x4_t a);
35template <typename T> nce T reinterpret(poly8x16_t a);
36template <typename T> nce T reinterpret(poly16x8_t a);
37template <typename T> nce T reinterpret(uint64x2_t a);
38template <typename T> nce T reinterpret(poly64x2_t a);
39template <typename T> nce T reinterpret(float16x8_t a);
40template <typename T> nce T reinterpret(poly128_t a);
41template <typename T> nce T create(uint64_t a);
42template <typename T> nce T duplicate(poly64_t value);
43template <typename T> nce T duplicate(poly64x1_t vec);
44template <typename T> nce T get(poly64x2_t a);
45template <typename T> nce T load1(poly64_t const *ptr);
46template <typename T> nce T load1_duplicate(poly64_t const *ptr);
47template <typename T> nce T load1_x2(poly64_t const *ptr);
48template <typename T> nce T load1_x3(poly64_t const *ptr);
49template <typename T> nce T load1_x4(poly64_t const *ptr);
50template <typename T> nce T store1(poly64_t *ptr, poly64x1_t val);
51template <typename T> nce T store1(poly64_t *ptr, poly64x2_t val);
52template <typename T> nce T convert(int32_t a);
53template <typename T> nce T convert(uint32_t a);
54template <typename T> nce T convert(float16_t a);
55template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float16_t a);
56template <typename T> nce T convert_round_toward_negative_infinity(float16_t a);
57template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float16_t a);
58template <typename T> nce T convert_round_toward_positive_infinity(float16_t a);
59template <typename T> nce T max(float16x4_t a, float16x4_t b);
60template <typename T> nce T max(float16x8_t a, float16x8_t b);
61template <typename T> nce T min(float16x4_t a, float16x4_t b);
62template <typename T> nce T min(float16x8_t a, float16x8_t b);
63template <typename T> nce T max_strict(float16x4_t a, float16x4_t b);
64template <typename T> nce T max_strict(float16x8_t a, float16x8_t b);
65template <typename T> nce T min_strict(float16x4_t a, float16x4_t b);
66template <typename T> nce T min_strict(float16x8_t a, float16x8_t b);
67template <typename T> nce T convert(int16x4_t a);
68template <typename T> nce T convert(uint16x4_t a);
69template <typename T> nce T convert(float16x4_t a);
70template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float16x4_t a);
71template <typename T> nce T convert_round_toward_negative_infinity(float16x4_t a);
72template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float16x4_t a);
73template <typename T> nce T convert_round_toward_positive_infinity(float16x4_t a);
74template <typename T> nce T dot_product(uint32x2_t r, uint8x8_t a, uint8x8_t b);
75template <typename T> nce T dot_product(int32x2_t r, int8x8_t a, int8x8_t b);
76template <typename T> nce T dot_product(uint32x4_t r, uint8x16_t a, uint8x16_t b);
77template <typename T> nce T dot_product(int32x4_t r, int8x16_t a, int8x16_t b);
78template <typename T> nce T multiply_add_long_fused_high(float32x2_t r, float16x4_t a, float16x4_t b);
79template <typename T> nce T multiply_add_long_fused_high(float32x4_t r, float16x8_t a, float16x8_t b) ;
80template <typename T> nce T multiply_add_long_fused_low(float32x2_t r, float16x4_t a, float16x4_t b);
81template <typename T> nce T multiply_add_long_fused_low(float32x4_t r, float16x8_t a, float16x8_t b) ;
82template <typename T> nce T multiply_add_long_fused(float32x2_t r, float16x4_t a, float16x4_t b);
83template <typename T> nce T multiply_add_long_fused(float32x2_t r, float16x4_t a, float16x8_t b);
84template <typename T> nce T multiply_add_long_fused(float32x4_t r, float16x8_t a, float16x4_t b);
85template <typename T> nce T multiply_add_long_fused(float32x4_t r, float16x8_t a, float16x8_t b);
86template <typename T> nce T multiply_subtract_long_fused_high(float32x2_t r, float16x4_t a, float16x4_t b);
87template <typename T> nce T multiply_subtract_long_fused_high(float32x4_t r, float16x8_t a, float16x8_t b) ;
88template <typename T> nce T multiply_subtract_long_fused_low(float32x2_t r, float16x4_t a, float16x4_t b);
89template <typename T> nce T multiply_subtract_long_fused_low(float32x4_t r, float16x8_t a, float16x8_t b) ;
90template <typename T> nce T multiply_subtract_long_fused(float32x2_t r, float16x4_t a, float16x4_t b);
91template <typename T> nce T multiply_subtract_long_fused(float32x2_t r, float16x4_t a, float16x8_t b);
92template <typename T> nce T multiply_subtract_long_fused(float32x4_t r, float16x8_t a, float16x4_t b);
93template <typename T> nce T multiply_subtract_long_fused(float32x4_t r, float16x8_t a, float16x8_t b);
94template <typename T> nce T complex_multiply_add(float16x4_t r, float16x4_t a, float16x4_t b);
95template <typename T> nce T complex_multiply_add(float32x2_t r, float32x2_t a, float32x2_t b);
96template <typename T> nce T complex_multiply_add(float16x8_t r, float16x8_t a, float16x8_t b);
97template <typename T> nce T complex_multiply_add(float32x4_t r, float32x4_t a, float32x4_t b);
98template <typename T> nce T complex_multiply_add_rotate_90(float16x4_t r, float16x4_t a, float16x4_t b);
99template <typename T> nce T complex_multiply_add_rotate_90(float32x2_t r, float32x2_t a, float32x2_t b);
100template <typename T> nce T complex_multiply_add_rotate_90(float16x8_t r, float16x8_t a, float16x8_t b);
101template <typename T> nce T complex_multiply_add_rotate_90(float32x4_t r, float32x4_t a, float32x4_t b);
102template <typename T> nce T complex_multiply_add_rotate_180(float16x4_t r, float16x4_t a, float16x4_t b);
103template <typename T> nce T complex_multiply_add_rotate_180(float32x2_t r, float32x2_t a, float32x2_t b);
104template <typename T> nce T complex_multiply_add_rotate_180(float16x8_t r, float16x8_t a, float16x8_t b);
105template <typename T> nce T complex_multiply_add_rotate_180(float32x4_t r, float32x4_t a, float32x4_t b);
106template <typename T> nce T complex_multiply_add_rotate_270(float16x4_t r, float16x4_t a, float16x4_t b);
107template <typename T> nce T complex_multiply_add_rotate_270(float32x2_t r, float32x2_t a, float32x2_t b);
108template <typename T> nce T complex_multiply_add_rotate_270(float16x8_t r, float16x8_t a, float16x8_t b);
109template <typename T> nce T complex_multiply_add_rotate_270(float32x4_t r, float32x4_t a, float32x4_t b);
110template <typename T> nce T dot_product(int32x2_t r, uint8x8_t a, int8x8_t b);
111template <typename T> nce T dot_product(int32x4_t r, uint8x16_t a, int8x16_t b);
112template <typename T> nce T duplicate(bfloat16_t value);
113template <typename T> nce T duplicate(bfloat16x4_t vec);
114template <typename T> nce T duplicate(bfloat16x8_t vec);
115template <typename T> nce T get(bfloat16x8_t a);
116template <typename T> nce T load1(bfloat16_t const *ptr);
117template <typename T> nce T load1_duplicate(bfloat16_t const *ptr);
118template <typename T> nce T load2(bfloat16_t const *ptr);
119template <typename T> nce T load3(bfloat16_t const *ptr);
120template <typename T> nce T load4(bfloat16_t const *ptr);
121template <typename T> nce T load2_duplicate(bfloat16_t const *ptr);
122template <typename T> nce T load3_duplicate(bfloat16_t const *ptr);
123template <typename T> nce T load4_duplicate(bfloat16_t const *ptr);
124template <typename T> nce T load2(poly64_t const *ptr);
125template <typename T> nce T load3(poly64_t const *ptr);
126template <typename T> nce T load4(poly64_t const *ptr);
127template <typename T> nce T load2_duplicate(poly64_t const *ptr);
128template <typename T> nce T load3_duplicate(poly64_t const *ptr);
129template <typename T> nce T load4_duplicate(poly64_t const *ptr);
130template <typename T> nce T load1_x2(bfloat16_t const *ptr);
131template <typename T> nce T load1_x3(bfloat16_t const *ptr);
132template <typename T> nce T load1_x4(bfloat16_t const *ptr);
133template <typename T> nce T store1(bfloat16_t *ptr, bfloat16x4_t val);
134template <typename T> nce T store1(bfloat16_t *ptr, bfloat16x8_t val);
135template <typename T> nce T store2(bfloat16_t *ptr, bfloat16x4x2_t val);
136template <typename T> nce T store2(bfloat16_t *ptr, bfloat16x8x2_t val);
137template <typename T> nce T store3(bfloat16_t *ptr, bfloat16x4x3_t val);
138template <typename T> nce T store3(bfloat16_t *ptr, bfloat16x8x3_t val);
139template <typename T> nce T store4(bfloat16_t *ptr, bfloat16x4x4_t val);
140template <typename T> nce T store4(bfloat16_t *ptr, bfloat16x8x4_t val);
141template <typename T> nce T reinterpret(bfloat16x4_t a);
142template <typename T> nce T reinterpret(bfloat16x8_t a);
143template <typename T> nce T convert(bfloat16x8_t a);
144template <typename T> nce T convert(float32x4_t a);
145template <typename T> nce T dot_product(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b);
146template <typename T> nce T dot_product(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b);
147template <typename T> nce T multiply_add_long_widen_bottom(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b);
148template <typename T> nce T multiply_add_long_widen_top(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b);
149template <typename T> nce T convert_low(bfloat16x8_t a);
150template <typename T> nce T convert_high(bfloat16x8_t a);
151template <typename T> nce T convert_low(float32x4_t a);
152template <typename T> nce T get_high(poly64x2_t a);
153template <typename T> nce T get_low(poly64x2_t a);
154template <typename T> nce T get_high(bfloat16x8_t a);
155template <typename T> nce T get_low(bfloat16x8_t a);
156
157template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(uint8x8_t a) { return vreinterpret_p64_u8(a); }
158template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(uint8x8_t a) { return vreinterpret_bf16_u8(a); }
159template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(uint8x16_t a) { return vreinterpretq_p64_u8(a); }
160template <> [[gnu::always_inline]] nce poly128_t reinterpret(uint8x16_t a) { return vreinterpretq_p128_u8(a); }
161[[gnu::always_inline]] nce uint8x16_t aes_encrypt(uint8x16_t data, uint8x16_t key) { return vaeseq_u8(data, key); }
162[[gnu::always_inline]] nce uint8x16_t aes_decrypt(uint8x16_t data, uint8x16_t key) { return vaesdq_u8(data, key); }
163[[gnu::always_inline]] nce uint8x16_t aes_mix_columns(uint8x16_t data) { return vaesmcq_u8(data); }
164[[gnu::always_inline]] nce uint8x16_t aes_inverse_mix_columns(uint8x16_t data) { return vaesimcq_u8(data); }
165template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(uint8x16_t a) { return vreinterpretq_bf16_u8(a); }
166template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(int8x8_t a) { return vreinterpret_p64_s8(a); }
167template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(int8x8_t a) { return vreinterpret_bf16_s8(a); }
168template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(int8x16_t a) { return vreinterpretq_p64_s8(a); }
169template <> [[gnu::always_inline]] nce poly128_t reinterpret(int8x16_t a) { return vreinterpretq_p128_s8(a); }
170template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(int8x16_t a) { return vreinterpretq_bf16_s8(a); }
171template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(uint16x4_t a) { return vreinterpret_p64_u16(a); }
172template <> [[gnu::always_inline]] nce float16x4_t convert(uint16x4_t a) { return vcvt_f16_u16(a); }
173template <int n>[[gnu::always_inline]] nce float16x4_t convert(uint16x4_t a) { return vcvt_n_f16_u16(a, n); }
174template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(uint16x4_t a) { return vreinterpret_bf16_u16(a); }
175template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(uint16x8_t a) { return vreinterpretq_p64_u16(a); }
176template <> [[gnu::always_inline]] nce poly128_t reinterpret(uint16x8_t a) { return vreinterpretq_p128_u16(a); }
177template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(uint16x8_t a) { return vreinterpretq_bf16_u16(a); }
178template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(int16x4_t a) { return vreinterpret_p64_s16(a); }
179template <> [[gnu::always_inline]] nce float16x4_t convert(int16x4_t a) { return vcvt_f16_s16(a); }
180template <int n>[[gnu::always_inline]] nce float16x4_t convert(int16x4_t a) { return vcvt_n_f16_s16(a, n); }
181template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(int16x4_t a) { return vreinterpret_bf16_s16(a); }
182template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(int16x8_t a) { return vreinterpretq_p64_s16(a); }
183template <> [[gnu::always_inline]] nce poly128_t reinterpret(int16x8_t a) { return vreinterpretq_p128_s16(a); }
184template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(int16x8_t a) { return vreinterpretq_bf16_s16(a); }
185template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(int32x2_t a) { return vreinterpret_p64_s32(a); }
186template <> [[gnu::always_inline]] nce int32x2_t dot_product(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_s32(r, a, b); }
187template <int lane>[[gnu::always_inline]] nce int32x2_t dot_product_lane(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_lane_s32(r, a, b, lane); }
188template <int lane>[[gnu::always_inline]] nce int32x2_t dot_product_lane(int32x2_t r, uint8x8_t a, int8x16_t b) { return vusdot_laneq_s32(r, a, b, lane); }
189template <int lane>[[gnu::always_inline]] nce int32x2_t dot_product_lane(int32x2_t r, int8x8_t a, uint8x8_t b) { return vsudot_lane_s32(r, a, b, lane); }
190template <int lane>[[gnu::always_inline]] nce int32x2_t dot_product_lane(int32x2_t r, int8x8_t a, uint8x16_t b) { return vsudot_laneq_s32(r, a, b, lane); }
191template <> [[gnu::always_inline]] nce int32x2_t dot_product(int32x2_t r, int8x8_t a, int8x8_t b) { return vdot_s32(r, a, b); }
192template <int lane>[[gnu::always_inline]] nce int32x2_t dot_product_lane(int32x2_t r, int8x8_t a, int8x8_t b) { return vdot_lane_s32(r, a, b, lane); }
193template <int lane>[[gnu::always_inline]] nce int32x2_t dot_product_lane(int32x2_t r, int8x8_t a, int8x16_t b) { return vdot_laneq_s32(r, a, b, lane); }
194template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(int32x2_t a) { return vreinterpret_bf16_s32(a); }
195template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(int32x4_t a) { return vreinterpretq_p64_s32(a); }
196template <> [[gnu::always_inline]] nce poly128_t reinterpret(int32x4_t a) { return vreinterpretq_p128_s32(a); }
197template <int lane>[[gnu::always_inline]] nce int32x4_t dot_product_lane(int32x4_t r, uint8x16_t a, int8x8_t b) { return vusdotq_lane_s32(r, a, b, lane); }
198[[gnu::always_inline]] nce int32x4_t matrix_multiply_add(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusmmlaq_s32(r, a, b); }
199template <> [[gnu::always_inline]] nce int32x4_t dot_product(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusdotq_s32(r, a, b); }
200template <int lane>[[gnu::always_inline]] nce int32x4_t dot_product_lane(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusdotq_laneq_s32(r, a, b, lane); }
201template <int lane>[[gnu::always_inline]] nce int32x4_t dot_product_lane(int32x4_t r, int8x16_t a, uint8x8_t b) { return vsudotq_lane_s32(r, a, b, lane); }
202template <int lane>[[gnu::always_inline]] nce int32x4_t dot_product_lane(int32x4_t r, int8x16_t a, uint8x16_t b) { return vsudotq_laneq_s32(r, a, b, lane); }
203template <int lane>[[gnu::always_inline]] nce int32x4_t dot_product_lane(int32x4_t r, int8x16_t a, int8x8_t b) { return vdotq_lane_s32(r, a, b, lane); }
204template <> [[gnu::always_inline]] nce int32x4_t dot_product(int32x4_t r, int8x16_t a, int8x16_t b) { return vdotq_s32(r, a, b); }
205template <int lane>[[gnu::always_inline]] nce int32x4_t dot_product_lane(int32x4_t r, int8x16_t a, int8x16_t b) { return vdotq_laneq_s32(r, a, b, lane); }
206[[gnu::always_inline]] nce int32x4_t matrix_multiply_add(int32x4_t r, int8x16_t a, int8x16_t b) { return vmmlaq_s32(r, a, b); }
207template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(int32x4_t a) { return vreinterpretq_bf16_s32(a); }
208template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(uint64x1_t a) { return vreinterpret_p64_u64(a); }
209template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(uint64x1_t a) { return vreinterpret_bf16_u64(a); }
210template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(uint64x2_t a) { return vreinterpretq_p64_u64(a); }
211template <> [[gnu::always_inline]] nce poly128_t reinterpret(uint64x2_t a) { return vreinterpretq_p128_u64(a); }
212template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(uint64x2_t a) { return vreinterpretq_bf16_u64(a); }
213template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(uint32x2_t a) { return vreinterpret_p64_u32(a); }
214template <> [[gnu::always_inline]] nce uint32x2_t dot_product(uint32x2_t r, uint8x8_t a, uint8x8_t b) { return vdot_u32(r, a, b); }
215template <int lane>[[gnu::always_inline]] nce uint32x2_t dot_product_lane(uint32x2_t r, uint8x8_t a, uint8x8_t b) { return vdot_lane_u32(r, a, b, lane); }
216template <int lane>[[gnu::always_inline]] nce uint32x2_t dot_product_lane(uint32x2_t r, uint8x8_t a, uint8x16_t b) { return vdot_laneq_u32(r, a, b, lane); }
217template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(uint32x2_t a) { return vreinterpret_bf16_u32(a); }
218template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(uint32x4_t a) { return vreinterpretq_p64_u32(a); }
219template <> [[gnu::always_inline]] nce poly128_t reinterpret(uint32x4_t a) { return vreinterpretq_p128_u32(a); }
220template <int lane>[[gnu::always_inline]] nce uint32x4_t dot_product_lane(uint32x4_t r, uint8x16_t a, uint8x8_t b) { return vdotq_lane_u32(r, a, b, lane); }
221template <> [[gnu::always_inline]] nce uint32x4_t dot_product(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vdotq_u32(r, a, b); }
222template <int lane>[[gnu::always_inline]] nce uint32x4_t dot_product_lane(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vdotq_laneq_u32(r, a, b, lane); }
223[[gnu::always_inline]] nce uint32x4_t matrix_multiply_add(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vmmlaq_u32(r, a, b); }
224[[gnu::always_inline]] nce uint32x4_t sha1_schedule_update_0(uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11) { return vsha1su0q_u32(w0_3, w4_7, w8_11); }
225[[gnu::always_inline]] nce uint32x4_t sha1_schedule_update_1(uint32x4_t tw0_3, uint32x4_t w12_15) { return vsha1su1q_u32(tw0_3, w12_15); }
226[[gnu::always_inline]] nce uint32x4_t sha256_hash_part_1(uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk) { return vsha256hq_u32(hash_abcd, hash_efgh, wk); }
227[[gnu::always_inline]] nce uint32x4_t sha256_hash_part_2(uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk) { return vsha256h2q_u32(hash_efgh, hash_abcd, wk); }
228[[gnu::always_inline]] nce uint32x4_t sha256_schedule_update_0(uint32x4_t w0_3, uint32x4_t w4_7) { return vsha256su0q_u32(w0_3, w4_7); }
229[[gnu::always_inline]] nce uint32x4_t sha256_schedule_update_1(uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15) { return vsha256su1q_u32(tw0_3, w8_11, w12_15); }
230[[gnu::always_inline]] nce uint32x4_t sha1_choose(uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) { return vsha1cq_u32(hash_abcd, hash_e, wk); }
231[[gnu::always_inline]] nce uint32x4_t sha1_parity(uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) { return vsha1pq_u32(hash_abcd, hash_e, wk); }
232[[gnu::always_inline]] nce uint32x4_t sha1_majority(uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) { return vsha1mq_u32(hash_abcd, hash_e, wk); }
233template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(uint32x4_t a) { return vreinterpretq_bf16_u32(a); }
234[[gnu::always_inline]] nce poly64x1_t reinterpret(float16x4_t a) { return vreinterpret_p64_f16(a); }
235[[gnu::always_inline]] nce float16x4_t absolute(float16x4_t a) { return vabs_f16(a); }
236[[gnu::always_inline]] nce float16x4_t subtract_absolute(float16x4_t a, float16x4_t b) { return vabd_f16(a, b); }
237[[gnu::always_inline]] nce float16x4_t reciprocal_estimate(float16x4_t a) { return vrecpe_f16(a); }
238[[gnu::always_inline]] nce float16x4_t reciprocal_sqrt_estimate(float16x4_t a) { return vrsqrte_f16(a); }
239[[gnu::always_inline]] nce float16x4_t reciprocal_sqrt_step(float16x4_t a, float16x4_t b) { return vrsqrts_f16(a, b); }
240[[gnu::always_inline]] nce float16x4_t reciprocal_step(float16x4_t a, float16x4_t b) { return vrecps_f16(a, b); }
241[[gnu::always_inline]] nce float16x4_t round(float16x4_t a) { return vrnd_f16(a); }
242[[gnu::always_inline]] nce float16x4_t round_to_nearest_with_ties_away_from_zero(float16x4_t a) { return vrnda_f16(a); }
243[[gnu::always_inline]] nce float16x4_t round_toward_negative_infinity(float16x4_t a) { return vrndm_f16(a); }
244[[gnu::always_inline]] nce float16x4_t round_to_nearest_with_ties_to_even(float16x4_t a) { return vrndn_f16(a); }
245[[gnu::always_inline]] nce float16x4_t round_toward_positive_infinity(float16x4_t a) { return vrndp_f16(a); }
246[[gnu::always_inline]] nce float16x4_t round_inexact(float16x4_t a) { return vrndx_f16(a); }
247[[gnu::always_inline]] nce float16x4_t add(float16x4_t a, float16x4_t b) { return vadd_f16(a, b); }
248template <> [[gnu::always_inline]] nce float16x4_t max(float16x4_t a, float16x4_t b) { return vmax_f16(a, b); }
249template <> [[gnu::always_inline]] nce float16x4_t max_strict(float16x4_t a, float16x4_t b) { return vmaxnm_f16(a, b); }
250template <> [[gnu::always_inline]] nce float16x4_t min(float16x4_t a, float16x4_t b) { return vmin_f16(a, b); }
251template <> [[gnu::always_inline]] nce float16x4_t min_strict(float16x4_t a, float16x4_t b) { return vminnm_f16(a, b); }
252[[gnu::always_inline]] nce float16x4_t multiply(float16x4_t a, float16x4_t b) { return vmul_f16(a, b); }
253template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_lane(float16x4_t a, float16x4_t v) { return vmul_lane_f16(a, v, lane); }
254[[gnu::always_inline]] nce float16x4_t multiply_add_fused(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_f16(a, b, c); }
255[[gnu::always_inline]] nce float16x4_t multiply_subtract_fused(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_f16(a, b, c); }
256[[gnu::always_inline]] nce float16x4_t pairwise_add(float16x4_t a, float16x4_t b) { return vpadd_f16(a, b); }
257[[gnu::always_inline]] nce float16x4_t pairwise_max(float16x4_t a, float16x4_t b) { return vpmax_f16(a, b); }
258[[gnu::always_inline]] nce float16x4_t pairwise_min(float16x4_t a, float16x4_t b) { return vpmin_f16(a, b); }
259[[gnu::always_inline]] nce float16x4_t subtract(float16x4_t a, float16x4_t b) { return vsub_f16(a, b); }
260[[gnu::always_inline]] nce float16x4_t multiply(float16x4_t a, float16_t n) { return vmul_n_f16(a, n); }
261[[gnu::always_inline]] nce uint16x4_t equal_to_zero(float16x4_t a) { return vceqz_f16(a); }
262[[gnu::always_inline]] nce uint16x4_t greater_than_or_equal_to_zero(float16x4_t a) { return vcgez_f16(a); }
263[[gnu::always_inline]] nce uint16x4_t greater_than_zero(float16x4_t a) { return vcgtz_f16(a); }
264[[gnu::always_inline]] nce uint16x4_t less_than_or_equal_to_zero(float16x4_t a) { return vclez_f16(a); }
265[[gnu::always_inline]] nce uint16x4_t less_than_zero(float16x4_t a) { return vcltz_f16(a); }
266[[gnu::always_inline]] nce uint16x4_t absolute_greater_than_or_equal(float16x4_t a, float16x4_t b) { return vcage_f16(a, b); }
267[[gnu::always_inline]] nce uint16x4_t absolute_greater_than(float16x4_t a, float16x4_t b) { return vcagt_f16(a, b); }
268[[gnu::always_inline]] nce uint16x4_t absolute_less_than_or_equal(float16x4_t a, float16x4_t b) { return vcale_f16(a, b); }
269[[gnu::always_inline]] nce uint16x4_t absolute_less_than(float16x4_t a, float16x4_t b) { return vcalt_f16(a, b); }
270[[gnu::always_inline]] nce uint16x4_t equal(float16x4_t a, float16x4_t b) { return vceq_f16(a, b); }
271[[gnu::always_inline]] nce uint16x4_t greater_than_or_equal(float16x4_t a, float16x4_t b) { return vcge_f16(a, b); }
272[[gnu::always_inline]] nce uint16x4_t greater_than(float16x4_t a, float16x4_t b) { return vcgt_f16(a, b); }
273[[gnu::always_inline]] nce uint16x4_t less_than_or_equal(float16x4_t a, float16x4_t b) { return vcle_f16(a, b); }
274[[gnu::always_inline]] nce uint16x4_t less_than(float16x4_t a, float16x4_t b) { return vclt_f16(a, b); }
275template <> [[gnu::always_inline]] nce int16x4_t convert(float16x4_t a) { return vcvt_s16_f16(a); }
276template <> [[gnu::always_inline]] nce uint16x4_t convert(float16x4_t a) { return vcvt_u16_f16(a); }
277template <> [[gnu::always_inline]] nce int16x4_t convert_round_to_nearest_with_ties_away_from_zero(float16x4_t a) { return vcvta_s16_f16(a); }
278template <> [[gnu::always_inline]] nce uint16x4_t convert_round_to_nearest_with_ties_away_from_zero(float16x4_t a) { return vcvta_u16_f16(a); }
279template <> [[gnu::always_inline]] nce int16x4_t convert_round_toward_negative_infinity(float16x4_t a) { return vcvtm_s16_f16(a); }
280template <> [[gnu::always_inline]] nce uint16x4_t convert_round_toward_negative_infinity(float16x4_t a) { return vcvtm_u16_f16(a); }
281template <> [[gnu::always_inline]] nce int16x4_t convert_round_to_nearest_with_ties_to_even(float16x4_t a) { return vcvtn_s16_f16(a); }
282template <> [[gnu::always_inline]] nce uint16x4_t convert_round_to_nearest_with_ties_to_even(float16x4_t a) { return vcvtn_u16_f16(a); }
283template <> [[gnu::always_inline]] nce int16x4_t convert_round_toward_positive_infinity(float16x4_t a) { return vcvtp_s16_f16(a); }
284template <> [[gnu::always_inline]] nce uint16x4_t convert_round_toward_positive_infinity(float16x4_t a) { return vcvtp_u16_f16(a); }
285template <int n>[[gnu::always_inline]] nce int16x4_t convert(float16x4_t a) { return vcvt_n_s16_f16(a, n); }
286template <int n>[[gnu::always_inline]] nce uint16x4_t convert(float16x4_t a) { return vcvt_n_u16_f16(a, n); }
287[[gnu::always_inline]] nce float16x4_t negate(float16x4_t a) { return vneg_f16(a); }
288[[gnu::always_inline]] nce float16x4_t complex_add_rotate_90(float16x4_t a, float16x4_t b) { return vcadd_rot90_f16(a, b); }
289[[gnu::always_inline]] nce float16x4_t complex_add_rotate_270(float16x4_t a, float16x4_t b) { return vcadd_rot270_f16(a, b); }
290template <> [[gnu::always_inline]] nce float16x4_t complex_multiply_add(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_f16(r, a, b); }
291template <> [[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_90(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_rot90_f16(r, a, b); }
292template <> [[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_180(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_rot180_f16(r, a, b); }
293template <> [[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_270(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_rot270_f16(r, a, b); }
294template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_lane(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_lane_f16(r, a, b, lane); }
295template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_90_lane(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_rot90_lane_f16(r, a, b, lane); }
296template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_180_lane(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_rot180_lane_f16(r, a, b, lane); }
297template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_270_lane(float16x4_t r, float16x4_t a, float16x4_t b) { return vcmla_rot270_lane_f16(r, a, b, lane); }
298template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_lane(float16x4_t r, float16x4_t a, float16x8_t b) { return vcmla_laneq_f16(r, a, b, lane); }
299template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_90_lane(float16x4_t r, float16x4_t a, float16x8_t b) { return vcmla_rot90_laneq_f16(r, a, b, lane); }
300template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_180_lane(float16x4_t r, float16x4_t a, float16x8_t b) { return vcmla_rot180_laneq_f16(r, a, b, lane); }
301template <int lane>[[gnu::always_inline]] nce float16x4_t complex_multiply_add_rotate_270_lane(float16x4_t r, float16x4_t a, float16x8_t b) { return vcmla_rot270_laneq_f16(r, a, b, lane); }
302template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(float16x8_t a) { return vreinterpretq_p64_f16(a); }
303template <> [[gnu::always_inline]] nce poly128_t reinterpret(float16x8_t a) { return vreinterpretq_p128_f16(a); }
304[[gnu::always_inline]] nce float16x8_t absolute(float16x8_t a) { return vabsq_f16(a); }
305[[gnu::always_inline]] nce float16x8_t reciprocal_estimate(float16x8_t a) { return vrecpeq_f16(a); }
306[[gnu::always_inline]] nce float16x8_t reciprocal_sqrt_estimate(float16x8_t a) { return vrsqrteq_f16(a); }
307[[gnu::always_inline]] nce float16x8_t reciprocal_sqrt_step(float16x8_t a, float16x8_t b) { return vrsqrtsq_f16(a, b); }
308[[gnu::always_inline]] nce float16x8_t reciprocal_step(float16x8_t a, float16x8_t b) { return vrecpsq_f16(a, b); }
309[[gnu::always_inline]] nce float16x8_t round(float16x8_t a) { return vrndq_f16(a); }
310[[gnu::always_inline]] nce float16x8_t round_to_nearest_with_ties_away_from_zero(float16x8_t a) { return vrndaq_f16(a); }
311[[gnu::always_inline]] nce float16x8_t round_toward_negative_infinity(float16x8_t a) { return vrndmq_f16(a); }
312[[gnu::always_inline]] nce float16x8_t round_to_nearest_with_ties_to_even(float16x8_t a) { return vrndnq_f16(a); }
313[[gnu::always_inline]] nce float16x8_t round_toward_positive_infinity(float16x8_t a) { return vrndpq_f16(a); }
314[[gnu::always_inline]] nce float16x8_t round_inexact(float16x8_t a) { return vrndxq_f16(a); }
315template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_lane(float16x8_t a, float16x4_t v) { return vmulq_lane_f16(a, v, lane); }
316template <> [[gnu::always_inline]] nce float16x8_t max(float16x8_t a, float16x8_t b) { return vmaxq_f16(a, b); }
317template <> [[gnu::always_inline]] nce float16x8_t max_strict(float16x8_t a, float16x8_t b) { return vmaxnmq_f16(a, b); }
318template <> [[gnu::always_inline]] nce float16x8_t min(float16x8_t a, float16x8_t b) { return vminq_f16(a, b); }
319template <> [[gnu::always_inline]] nce float16x8_t min_strict(float16x8_t a, float16x8_t b) { return vminnmq_f16(a, b); }
320[[gnu::always_inline]] nce float16x8_t multiply_add_fused(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_f16(a, b, c); }
321[[gnu::always_inline]] nce float16x8_t multiply_subtract_fused(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_f16(a, b, c); }
// float16x8_t comparison wrappers. Every function here returns a uint16x8_t produced by the
// vc*q_f16 intrinsic named in its body: compare-against-zero forms take one vector, the
// absolute (vca*) and plain (vceq/vcge/vcgt/vcle/vclt) forms take two.
322[[gnu::always_inline]] nce uint16x8_t greater_than_zero(float16x8_t a) { return vcgtzq_f16(a); }
323[[gnu::always_inline]] nce uint16x8_t less_than_or_equal_to_zero(float16x8_t a) { return vclezq_f16(a); }
324[[gnu::always_inline]] nce uint16x8_t less_than_zero(float16x8_t a) { return vcltzq_f16(a); }
325[[gnu::always_inline]] nce uint16x8_t absolute_greater_than_or_equal(float16x8_t a, float16x8_t b) { return vcageq_f16(a, b); }
326[[gnu::always_inline]] nce uint16x8_t absolute_greater_than(float16x8_t a, float16x8_t b) { return vcagtq_f16(a, b); }
327[[gnu::always_inline]] nce uint16x8_t absolute_less_than_or_equal(float16x8_t a, float16x8_t b) { return vcaleq_f16(a, b); }
328[[gnu::always_inline]] nce uint16x8_t absolute_less_than(float16x8_t a, float16x8_t b) { return vcaltq_f16(a, b); }
329[[gnu::always_inline]] nce uint16x8_t equal(float16x8_t a, float16x8_t b) { return vceqq_f16(a, b); }
330[[gnu::always_inline]] nce uint16x8_t greater_than_or_equal(float16x8_t a, float16x8_t b) { return vcgeq_f16(a, b); }
331[[gnu::always_inline]] nce uint16x8_t greater_than(float16x8_t a, float16x8_t b) { return vcgtq_f16(a, b); }
332[[gnu::always_inline]] nce uint16x8_t less_than_or_equal(float16x8_t a, float16x8_t b) { return vcleq_f16(a, b); }
333[[gnu::always_inline]] nce uint16x8_t less_than(float16x8_t a, float16x8_t b) { return vcltq_f16(a, b); }
// float16x8_t complex multiply-add wrappers over the vcmlaq*_f16 intrinsics.
// The rotation (none/90/180/270) is encoded in the wrapper name and selects the matching
// vcmlaq[_rotN] intrinsic. The *_lane templates are overloaded on the type of `b`:
// a float16x4_t `b` goes to the _lane_ intrinsic, a float16x8_t `b` to the _laneq_ intrinsic.
// `r` is passed as the first intrinsic argument, `lane` as the last.
334template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_lane(float16x8_t r, float16x8_t a, float16x4_t b) { return vcmlaq_lane_f16(r, a, b, lane); }
335template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_90_lane(float16x8_t r, float16x8_t a, float16x4_t b) { return vcmlaq_rot90_lane_f16(r, a, b, lane); }
336template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_180_lane(float16x8_t r, float16x8_t a, float16x4_t b) { return vcmlaq_rot180_lane_f16(r, a, b, lane); }
337template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_270_lane(float16x8_t r, float16x8_t a, float16x4_t b) { return vcmlaq_rot270_lane_f16(r, a, b, lane); }
338template <> [[gnu::always_inline]] nce float16x8_t complex_multiply_add(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_f16(r, a, b); }
339template <> [[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_90(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_rot90_f16(r, a, b); }
340template <> [[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_180(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_rot180_f16(r, a, b); }
341template <> [[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_270(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_rot270_f16(r, a, b); }
342template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_lane(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_laneq_f16(r, a, b, lane); }
343template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_90_lane(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_rot90_laneq_f16(r, a, b, lane); }
344template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_180_lane(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_rot180_laneq_f16(r, a, b, lane); }
345template <int lane>[[gnu::always_inline]] nce float16x8_t complex_multiply_add_rotate_270_lane(float16x8_t r, float16x8_t a, float16x8_t b) { return vcmlaq_rot270_laneq_f16(r, a, b, lane); }
// Remaining float16x8_t wrappers: multiply by a float16_t scalar (vmulq_n_f16) and the two
// compare-against-zero forms vceqzq_f16 / vcgezq_f16, which return uint16x8_t.
346[[gnu::always_inline]] nce float16x8_t multiply(float16x8_t a, float16_t n) { return vmulq_n_f16(a, n); }
347[[gnu::always_inline]] nce uint16x8_t equal_to_zero(float16x8_t a) { return vceqzq_f16(a); }
348[[gnu::always_inline]] nce uint16x8_t greater_than_or_equal_to_zero(float16x8_t a) { return vcgezq_f16(a); }
// float32x2_t max/min (vmaxnm/vminnm) and rounding wrappers (vrnd, vrndn, vrndm, vrndp, vrnda,
// vrndi, vrndx). Unlike the float16x8_t max_strict/min_strict above, these are plain overloads
// rather than `template <>` specializations.
349[[gnu::always_inline]] nce float32x2_t max_strict(float32x2_t a, float32x2_t b) { return vmaxnm_f32(a, b); }
350[[gnu::always_inline]] nce float32x2_t min_strict(float32x2_t a, float32x2_t b) { return vminnm_f32(a, b); }
351[[gnu::always_inline]] nce float32x2_t round(float32x2_t a) { return vrnd_f32(a); }
352[[gnu::always_inline]] nce float32x2_t round_to_nearest_with_ties_to_even(float32x2_t a) { return vrndn_f32(a); }
353[[gnu::always_inline]] nce float32x2_t round_toward_negative_infinity(float32x2_t a) { return vrndm_f32(a); }
354[[gnu::always_inline]] nce float32x2_t round_toward_positive_infinity(float32x2_t a) { return vrndp_f32(a); }
355[[gnu::always_inline]] nce float32x2_t round_to_nearest_with_ties_away_from_zero(float32x2_t a) { return vrnda_f32(a); }
356[[gnu::always_inline]] nce float32x2_t round_using_current_mode(float32x2_t a) { return vrndi_f32(a); }
357[[gnu::always_inline]] nce float32x2_t round_inexact(float32x2_t a) { return vrndx_f32(a); }
// Explicit specializations of the convert_round_* templates declared at the top of this header
// (template <typename T> T convert_round_*(float32x2_t)). The return type selects the intrinsic:
// int32x2_t -> vcvt{n,m,p,a}_s32_f32, uint32x2_t -> vcvt{n,m,p,a}_u32_f32.
// The last line specializes reinterpret(float32x2_t) for a poly64x1_t result.
358template <> [[gnu::always_inline]] nce int32x2_t convert_round_to_nearest_with_ties_to_even(float32x2_t a) { return vcvtn_s32_f32(a); }
359template <> [[gnu::always_inline]] nce uint32x2_t convert_round_to_nearest_with_ties_to_even(float32x2_t a) { return vcvtn_u32_f32(a); }
360template <> [[gnu::always_inline]] nce int32x2_t convert_round_toward_negative_infinity(float32x2_t a) { return vcvtm_s32_f32(a); }
361template <> [[gnu::always_inline]] nce uint32x2_t convert_round_toward_negative_infinity(float32x2_t a) { return vcvtm_u32_f32(a); }
362template <> [[gnu::always_inline]] nce int32x2_t convert_round_toward_positive_infinity(float32x2_t a) { return vcvtp_s32_f32(a); }
363template <> [[gnu::always_inline]] nce uint32x2_t convert_round_toward_positive_infinity(float32x2_t a) { return vcvtp_u32_f32(a); }
364template <> [[gnu::always_inline]] nce int32x2_t convert_round_to_nearest_with_ties_away_from_zero(float32x2_t a) { return vcvta_s32_f32(a); }
365template <> [[gnu::always_inline]] nce uint32x2_t convert_round_to_nearest_with_ties_away_from_zero(float32x2_t a) { return vcvta_u32_f32(a); }
366template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(float32x2_t a) { return vreinterpret_p64_f32(a); }
// float32x2_t accumulating wrappers over the vfmlal/vfmlsl intrinsics: `r` is a float32x2_t
// accumulator-style first argument, `a` and `b` are float16 vectors.
// Name mapping: add -> vfmlal, subtract -> vfmlsl; low/high select the _low_/_high_ intrinsic.
// The *_lane_* templates are overloaded on `b`: float16x4_t -> _lane_, float16x8_t -> _laneq_.
367template <> [[gnu::always_inline]] nce float32x2_t multiply_add_long_fused_low(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlal_low_f16(r, a, b); }
368template <> [[gnu::always_inline]] nce float32x2_t multiply_subtract_long_fused_low(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlsl_low_f16(r, a, b); }
369template <> [[gnu::always_inline]] nce float32x2_t multiply_add_long_fused_high(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlal_high_f16(r, a, b); }
370template <> [[gnu::always_inline]] nce float32x2_t multiply_subtract_long_fused_high(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlsl_high_f16(r, a, b); }
371template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_long_fused_lane_low(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlal_lane_low_f16(r, a, b, lane); }
372template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_long_fused_lane_low(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlsl_lane_low_f16(r, a, b, lane); }
373template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_long_fused_lane_high(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlal_lane_high_f16(r, a, b, lane); }
374template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_long_fused_lane_high(float32x2_t r, float16x4_t a, float16x4_t b) { return vfmlsl_lane_high_f16(r, a, b, lane); }
375template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_long_fused_lane_low(float32x2_t r, float16x4_t a, float16x8_t b) { return vfmlal_laneq_low_f16(r, a, b, lane); }
376template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_long_fused_lane_low(float32x2_t r, float16x4_t a, float16x8_t b) { return vfmlsl_laneq_low_f16(r, a, b, lane); }
377template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_long_fused_lane_high(float32x2_t r, float16x4_t a, float16x8_t b) { return vfmlal_laneq_high_f16(r, a, b, lane); }
378template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_long_fused_lane_high(float32x2_t r, float16x4_t a, float16x8_t b) { return vfmlsl_laneq_high_f16(r, a, b, lane); }
// float32x2_t complex add (vcadd_rot90/rot270) and complex multiply-add (vcmla*) wrappers.
// The rotation in the wrapper name selects the matching _rotN intrinsic. The *_lane templates
// are overloaded on `b`: float32x2_t -> vcmla*_lane_f32, float32x4_t -> vcmla*_laneq_f32.
379[[gnu::always_inline]] nce float32x2_t complex_add_rotate_90(float32x2_t a, float32x2_t b) { return vcadd_rot90_f32(a, b); }
380[[gnu::always_inline]] nce float32x2_t complex_add_rotate_270(float32x2_t a, float32x2_t b) { return vcadd_rot270_f32(a, b); }
381template <> [[gnu::always_inline]] nce float32x2_t complex_multiply_add(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_f32(r, a, b); }
382template <> [[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_90(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_rot90_f32(r, a, b); }
383template <> [[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_180(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_rot180_f32(r, a, b); }
384template <> [[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_270(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_rot270_f32(r, a, b); }
385template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_lane(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_lane_f32(r, a, b, lane); }
386template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_90_lane(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_rot90_lane_f32(r, a, b, lane); }
387template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_180_lane(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_rot180_lane_f32(r, a, b, lane); }
388template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_270_lane(float32x2_t r, float32x2_t a, float32x2_t b) { return vcmla_rot270_lane_f32(r, a, b, lane); }
389template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_lane(float32x2_t r, float32x2_t a, float32x4_t b) { return vcmla_laneq_f32(r, a, b, lane); }
390template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_90_lane(float32x2_t r, float32x2_t a, float32x4_t b) { return vcmla_rot90_laneq_f32(r, a, b, lane); }
391template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_180_lane(float32x2_t r, float32x2_t a, float32x4_t b) { return vcmla_rot180_laneq_f32(r, a, b, lane); }
392template <int lane>[[gnu::always_inline]] nce float32x2_t complex_multiply_add_rotate_270_lane(float32x2_t r, float32x2_t a, float32x4_t b) { return vcmla_rot270_laneq_f32(r, a, b, lane); }
// bfloat16 support for float32x2_t: reinterpret to bfloat16x4_t, and dot-product wrappers over
// vbfdot_f32 / vbfdot_lane_f32 / vbfdot_laneq_f32. dot_product_lane is overloaded on `b`
// (bfloat16x4_t -> _lane_, bfloat16x8_t -> _laneq_); `r` is the first intrinsic argument.
393template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(float32x2_t a) { return vreinterpret_bf16_f32(a); }
394template <> [[gnu::always_inline]] nce float32x2_t dot_product(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); }
395template <int lane>[[gnu::always_inline]] nce float32x2_t dot_product_lane(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_lane_f32(r, a, b, lane); }
396template <int lane>[[gnu::always_inline]] nce float32x2_t dot_product_lane(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, lane); }
// float32x4_t: rounding via vrndiq_f32, plus reinterpret specializations whose return type
// (poly64x2_t or poly128_t) selects vreinterpretq_p64_f32 or vreinterpretq_p128_f32.
397[[gnu::always_inline]] nce float32x4_t round_using_current_mode(float32x4_t a) { return vrndiq_f32(a); }
398template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(float32x4_t a) { return vreinterpretq_p64_f32(a); }
399template <> [[gnu::always_inline]] nce poly128_t reinterpret(float32x4_t a) { return vreinterpretq_p128_f32(a); }
// float32x4_t counterparts of the vfmlal/vfmlsl wrappers, using the q-suffixed intrinsics
// (vfmlalq_* / vfmlslq_*). `a` is float16x8_t throughout; the *_lane_* templates are overloaded
// on `b`: float16x4_t -> _lane_, float16x8_t -> _laneq_. `lane` is passed as the last argument.
400template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_fused_lane_low(float32x4_t r, float16x8_t a, float16x4_t b) { return vfmlalq_lane_low_f16(r, a, b, lane); }
401template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_long_fused_lane_low(float32x4_t r, float16x8_t a, float16x4_t b) { return vfmlslq_lane_low_f16(r, a, b, lane); }
402template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_fused_lane_high(float32x4_t r, float16x8_t a, float16x4_t b) { return vfmlalq_lane_high_f16(r, a, b, lane); }
403template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_long_fused_lane_high(float32x4_t r, float16x8_t a, float16x4_t b) { return vfmlslq_lane_high_f16(r, a, b, lane); }
404template <> [[gnu::always_inline]] nce float32x4_t multiply_add_long_fused_low(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(r, a, b); }
405template <> [[gnu::always_inline]] nce float32x4_t multiply_subtract_long_fused_low(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlslq_low_f16(r, a, b); }
406template <> [[gnu::always_inline]] nce float32x4_t multiply_add_long_fused_high(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlalq_high_f16(r, a, b); }
407template <> [[gnu::always_inline]] nce float32x4_t multiply_subtract_long_fused_high(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlslq_high_f16(r, a, b); }
408template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_fused_lane_low(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlalq_laneq_low_f16(r, a, b, lane); }
409template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_long_fused_lane_low(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlslq_laneq_low_f16(r, a, b, lane); }
410template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_fused_lane_high(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlalq_laneq_high_f16(r, a, b, lane); }
411template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_long_fused_lane_high(float32x4_t r, float16x8_t a, float16x8_t b) { return vfmlslq_laneq_high_f16(r, a, b, lane); }
// float32x4_t complex multiply-add wrappers over vcmlaq*_f32. Same layout as the float16x8_t
// and float32x2_t families: the rotation in the name picks the _rotN intrinsic, and the *_lane
// templates are overloaded on `b` (float32x2_t -> _lane_, float32x4_t -> _laneq_).
412template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_lane(float32x4_t r, float32x4_t a, float32x2_t b) { return vcmlaq_lane_f32(r, a, b, lane); }
413template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_90_lane(float32x4_t r, float32x4_t a, float32x2_t b) { return vcmlaq_rot90_lane_f32(r, a, b, lane); }
414template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_180_lane(float32x4_t r, float32x4_t a, float32x2_t b) { return vcmlaq_rot180_lane_f32(r, a, b, lane); }
415template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_270_lane(float32x4_t r, float32x4_t a, float32x2_t b) { return vcmlaq_rot270_lane_f32(r, a, b, lane); }
416template <> [[gnu::always_inline]] nce float32x4_t complex_multiply_add(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_f32(r, a, b); }
417template <> [[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_90(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_rot90_f32(r, a, b); }
418template <> [[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_180(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_rot180_f32(r, a, b); }
419template <> [[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_270(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_rot270_f32(r, a, b); }
420template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_lane(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_laneq_f32(r, a, b, lane); }
421template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_90_lane(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_rot90_laneq_f32(r, a, b, lane); }
422template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_180_lane(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_rot180_laneq_f32(r, a, b, lane); }
423template <int lane>[[gnu::always_inline]] nce float32x4_t complex_multiply_add_rotate_270_lane(float32x4_t r, float32x4_t a, float32x4_t b) { return vcmlaq_rot270_laneq_f32(r, a, b, lane); }
// bfloat16 support for float32x4_t:
//  - reinterpret to bfloat16x8_t (vreinterpretq_bf16_f32);
//  - convert -> bfloat16x4_t via vcvt_bf16_f32, convert_low -> bfloat16x8_t via vcvtq_low_bf16_f32;
//  - dot_product / dot_product_lane over vbfdotq_f32, vbfdotq_laneq_f32 (bfloat16x8_t `b`) and
//    vbfdotq_lane_f32 (bfloat16x4_t `b`);
//  - matrix_multiply_add over vbfmmlaq_f32;
//  - multiply_add_long_widen_bottom/top over vbfmlalbq/vbfmlaltq, with *_lane overloads choosing
//    _lane_ or _laneq_ from the type of `b`.
424template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(float32x4_t a) { return vreinterpretq_bf16_f32(a); }
425template <> [[gnu::always_inline]] nce bfloat16x4_t convert(float32x4_t a) { return vcvt_bf16_f32(a); }
426template <> [[gnu::always_inline]] nce bfloat16x8_t convert_low(float32x4_t a) { return vcvtq_low_bf16_f32(a); }
427template <> [[gnu::always_inline]] nce float32x4_t dot_product(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_f32(r, a, b); }
428template <int lane>[[gnu::always_inline]] nce float32x4_t dot_product_lane(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, lane); }
429template <int lane>[[gnu::always_inline]] nce float32x4_t dot_product_lane(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, lane); }
430[[gnu::always_inline]] nce float32x4_t matrix_multiply_add(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); }
431template <> [[gnu::always_inline]] nce float32x4_t multiply_add_long_widen_bottom(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); }
432template <> [[gnu::always_inline]] nce float32x4_t multiply_add_long_widen_top(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); }
433template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_widen_bottom_lane(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, lane); }
434template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_widen_bottom_lane(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, lane); }
435template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_widen_top_lane(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, lane); }
436template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_long_widen_top_lane(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, lane); }
// reinterpret specializations for 64-bit polynomial sources (poly8x8_t, poly16x4_t).
// The requested return type (poly64x1_t or bfloat16x4_t) selects the vreinterpret_* intrinsic.
437template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(poly8x8_t a) { return vreinterpret_p64_p8(a); }
438template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(poly8x8_t a) { return vreinterpret_bf16_p8(a); }
439template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(poly16x4_t a) { return vreinterpret_p64_p16(a); }
440template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(poly16x4_t a) { return vreinterpret_bf16_p16(a); }
// round_to_nearest_with_ties_to_even overloads for float64x1_t, float64x2_t and scalar float32_t,
// forwarding to vrndn_f64, vrndnq_f64 and vrndns_f32 respectively.
441[[gnu::always_inline]] nce float64x1_t round_to_nearest_with_ties_to_even(float64x1_t a) { return vrndn_f64(a); }
442[[gnu::always_inline]] nce float64x2_t round_to_nearest_with_ties_to_even(float64x2_t a) { return vrndnq_f64(a); }
443[[gnu::always_inline]] nce float32_t round_to_nearest_with_ties_to_even(float32_t a) { return vrndns_f32(a); }
// poly64 comparison and shift-insert wrappers.
// equal / equal_to_zero / compare_test_nonzero return uint64 vectors from vceq*/vceqz*/vtst*_p64.
// shift_left_insert / shift_right_insert take the shift amount as template parameter `n` and
// pass it as the last argument of vsli(q)_n_p64 / vsri(q)_n_p64.
444[[gnu::always_inline]] nce uint64x1_t equal(poly64x1_t a, poly64x1_t b) { return vceq_p64(a, b); }
445[[gnu::always_inline]] nce uint64x2_t equal(poly64x2_t a, poly64x2_t b) { return vceqq_p64(a, b); }
446[[gnu::always_inline]] nce uint64x1_t equal_to_zero(poly64x1_t a) { return vceqz_p64(a); }
447[[gnu::always_inline]] nce uint64x2_t equal_to_zero(poly64x2_t a) { return vceqzq_p64(a); }
448[[gnu::always_inline]] nce uint64x1_t compare_test_nonzero(poly64x1_t a, poly64x1_t b) { return vtst_p64(a, b); }
449[[gnu::always_inline]] nce uint64x2_t compare_test_nonzero(poly64x2_t a, poly64x2_t b) { return vtstq_p64(a, b); }
450template <int n>[[gnu::always_inline]] nce poly64x1_t shift_left_insert(poly64x1_t a, poly64x1_t b) { return vsli_n_p64(a, b, n); }
451template <int n>[[gnu::always_inline]] nce poly64x2_t shift_left_insert(poly64x2_t a, poly64x2_t b) { return vsliq_n_p64(a, b, n); }
452template <int n>[[gnu::always_inline]] nce poly64x1_t shift_right_insert(poly64x1_t a, poly64x1_t b) { return vsri_n_p64(a, b, n); }
453template <int n>[[gnu::always_inline]] nce poly64x2_t shift_right_insert(poly64x2_t a, poly64x2_t b) { return vsriq_n_p64(a, b, n); }
// reinterpret specializations that convert to or from the poly64/poly128 types.
// Sources here are poly64x1_t, poly8x16_t, poly16x8_t, int64x2_t and poly64x2_t; the requested
// return type picks the vreinterpret(q)_<dst>_<src> intrinsic used in each body.
454template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(poly64x1_t a) { return vreinterpret_u64_p64(a); }
455template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(poly8x16_t a) { return vreinterpretq_p64_p8(a); }
456template <> [[gnu::always_inline]] nce poly128_t reinterpret(poly8x16_t a) { return vreinterpretq_p128_p8(a); }
457template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(poly16x8_t a) { return vreinterpretq_p64_p16(a); }
458template <> [[gnu::always_inline]] nce poly128_t reinterpret(poly16x8_t a) { return vreinterpretq_p128_p16(a); }
459template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(int64x2_t a) { return vreinterpretq_p64_s64(a); }
460template <> [[gnu::always_inline]] nce poly128_t reinterpret(int64x2_t a) { return vreinterpretq_p128_s64(a); }
461template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(poly64x2_t a) { return vreinterpretq_u64_p64(a); }
// reinterpret specializations taking a poly64x1_t source; one per 64-bit destination vector type.
// The mfloat8x8_t variant is present only as a commented-out line and is not compiled.
462template <> [[gnu::always_inline]] nce int8x8_t reinterpret(poly64x1_t a) { return vreinterpret_s8_p64(a); }
463template <> [[gnu::always_inline]] nce int16x4_t reinterpret(poly64x1_t a) { return vreinterpret_s16_p64(a); }
464template <> [[gnu::always_inline]] nce int32x2_t reinterpret(poly64x1_t a) { return vreinterpret_s32_p64(a); }
465template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(poly64x1_t a) { return vreinterpret_u8_p64(a); }
466template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(poly64x1_t a) { return vreinterpret_u16_p64(a); }
467template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(poly64x1_t a) { return vreinterpret_u32_p64(a); }
468template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(poly64x1_t a) { return vreinterpret_p8_p64(a); }
469template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(poly64x1_t a) { return vreinterpret_p16_p64(a); }
470//template <> [[gnu::always_inline]] nce mfloat8x8_t reinterpret(poly64x1_t a) { return vreinterpret_mf8_p64(a); }
471template <> [[gnu::always_inline]] nce int64x1_t reinterpret(poly64x1_t a) { return vreinterpret_s64_p64(a); }
472template <> [[gnu::always_inline]] nce float16x4_t reinterpret(poly64x1_t a) { return vreinterpret_f16_p64(a); }
// reinterpret specializations taking a poly64x2_t source; one per 128-bit destination vector
// type, each forwarding to the matching vreinterpretq_<dst>_p64 intrinsic.
473template <> [[gnu::always_inline]] nce int8x16_t reinterpret(poly64x2_t a) { return vreinterpretq_s8_p64(a); }
474template <> [[gnu::always_inline]] nce int16x8_t reinterpret(poly64x2_t a) { return vreinterpretq_s16_p64(a); }
475template <> [[gnu::always_inline]] nce int32x4_t reinterpret(poly64x2_t a) { return vreinterpretq_s32_p64(a); }
476template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(poly64x2_t a) { return vreinterpretq_u8_p64(a); }
477template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(poly64x2_t a) { return vreinterpretq_u16_p64(a); }
478template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(poly64x2_t a) { return vreinterpretq_u32_p64(a); }
479template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(poly64x2_t a) { return vreinterpretq_p8_p64(a); }
480template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(poly64x2_t a) { return vreinterpretq_p16_p64(a); }
481template <> [[gnu::always_inline]] nce int64x2_t reinterpret(poly64x2_t a) { return vreinterpretq_s64_p64(a); }
482template <> [[gnu::always_inline]] nce float16x8_t reinterpret(poly64x2_t a) { return vreinterpretq_f16_p64(a); }
// reinterpret specializations taking a poly128_t source; one per 128-bit destination vector
// type, each forwarding to the matching vreinterpretq_<dst>_p128 intrinsic.
483template <> [[gnu::always_inline]] nce int8x16_t reinterpret(poly128_t a) { return vreinterpretq_s8_p128(a); }
484template <> [[gnu::always_inline]] nce int16x8_t reinterpret(poly128_t a) { return vreinterpretq_s16_p128(a); }
485template <> [[gnu::always_inline]] nce int32x4_t reinterpret(poly128_t a) { return vreinterpretq_s32_p128(a); }
486template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(poly128_t a) { return vreinterpretq_u8_p128(a); }
487template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(poly128_t a) { return vreinterpretq_u16_p128(a); }
488template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(poly128_t a) { return vreinterpretq_u32_p128(a); }
489template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(poly128_t a) { return vreinterpretq_p8_p128(a); }
490template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(poly128_t a) { return vreinterpretq_p16_p128(a); }
491template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(poly128_t a) { return vreinterpretq_u64_p128(a); }
492template <> [[gnu::always_inline]] nce int64x2_t reinterpret(poly128_t a) { return vreinterpretq_s64_p128(a); }
493template <> [[gnu::always_inline]] nce float16x8_t reinterpret(poly128_t a) { return vreinterpretq_f16_p128(a); }
// bitwise_select for poly64 vectors: forwards to vbsl_p64 / vbslq_p64 with `a` as the first
// (selector) operand and `b`, `c` as the two data operands.
// The wrapper takes the selector as a poly64 vector. The clang branch passes it straight
// through; the non-clang branch has to hand the intrinsic a uint64 vector instead. That
// conversion is done by value with vreinterpret(q)_u64_p64 (the same intrinsics this header
// uses for reinterpret<uint64x1_t>/<uint64x2_t>), not by dereferencing a C-style-cast pointer
// to the parameter, which punned between two distinct vector types.
494#ifdef __clang__
495[[gnu::always_inline]] nce poly64x1_t bitwise_select(poly64x1_t a, poly64x1_t b, poly64x1_t c) { return vbsl_p64(a, b, c); }
496[[gnu::always_inline]] nce poly64x2_t bitwise_select(poly64x2_t a, poly64x2_t b, poly64x2_t c) { return vbslq_p64(a, b, c); }
497#else
498[[gnu::always_inline]] nce poly64x1_t bitwise_select(poly64x1_t a, poly64x1_t b, poly64x1_t c) { return vbsl_p64(vreinterpret_u64_p64(a), b, c); }
499[[gnu::always_inline]] nce poly64x2_t bitwise_select(poly64x2_t a, poly64x2_t b, poly64x2_t c) { return vbslq_p64(vreinterpretq_u64_p64(a), b, c); }
500#endif
// copy_lane for poly64 vectors. `lane1` indexes the destination vector `a`, `lane2` the source
// vector `b`; both are template parameters passed in the (a, lane1, b, lane2) order the
// vcopy*_p64 intrinsics take. The four overloads cover every 64-bit/128-bit combination of
// `a` and `b` (vcopy_lane, vcopyq_lane, vcopy_laneq, vcopyq_laneq).
501template <int lane1, int lane2>[[gnu::always_inline]] nce poly64x1_t copy_lane(poly64x1_t a, poly64x1_t b) { return vcopy_lane_p64(a, lane1, b, lane2); }
502template <int lane1, int lane2>[[gnu::always_inline]] nce poly64x2_t copy_lane(poly64x2_t a, poly64x1_t b) { return vcopyq_lane_p64(a, lane1, b, lane2); }
503template <int lane1, int lane2>[[gnu::always_inline]] nce poly64x1_t copy_lane(poly64x1_t a, poly64x2_t b) { return vcopy_laneq_p64(a, lane1, b, lane2); }
504template <int lane1, int lane2>[[gnu::always_inline]] nce poly64x2_t copy_lane(poly64x2_t a, poly64x2_t b) { return vcopyq_laneq_p64(a, lane1, b, lane2); }
// poly64 construction and element-access wrappers:
//  - create / duplicate specialize the templates declared at the top of this header, with the
//    return type (poly64x1_t vs poly64x2_t) choosing vdup_n_p64 or vdupq_n_p64;
//  - duplicate_lane / duplicate_lane_quad both read from a poly64x1_t and differ in result width;
//  - combine joins two poly64x1_t values via vcombine_p64; get_high / get_low split a poly64x2_t;
//  - get_lane returns a scalar poly64_t from either vector width.
505template <> [[gnu::always_inline]] nce poly64x1_t create(uint64_t a) { return vcreate_p64(a); }
506template <> [[gnu::always_inline]] nce poly64x1_t duplicate(poly64_t value) { return vdup_n_p64(value); }
507template <> [[gnu::always_inline]] nce poly64x2_t duplicate(poly64_t value) { return vdupq_n_p64(value); }
508template <int lane>[[gnu::always_inline]] nce poly64x1_t duplicate_lane(poly64x1_t vec) { return vdup_lane_p64(vec, lane); }
509template <int lane>[[gnu::always_inline]] nce poly64x2_t duplicate_lane_quad(poly64x1_t vec) { return vdupq_lane_p64(vec, lane); }
510[[gnu::always_inline]] nce poly64x2_t combine(poly64x1_t low, poly64x1_t high) { return vcombine_p64(low, high); }
511template <> [[gnu::always_inline]] nce poly64x1_t get_high(poly64x2_t a) { return vget_high_p64(a); }
512template <> [[gnu::always_inline]] nce poly64x1_t get_low(poly64x2_t a) { return vget_low_p64(a); }
513template <int lane>[[gnu::always_inline]] nce poly64_t get_lane(poly64x1_t v) { return vget_lane_p64(v, lane); }
514template <int lane>[[gnu::always_inline]] nce poly64_t get_lane(poly64x2_t v) { return vgetq_lane_p64(v, lane); }
// poly64 extract (vext_p64 / vextq_p64, with `n` as the trailing immediate) and set_lane
// (vset_lane_p64 / vsetq_lane_p64). set_lane takes the scalar first and the vector second,
// matching the argument order of the underlying intrinsic.
515template <int n>[[gnu::always_inline]] nce poly64x1_t extract(poly64x1_t a, poly64x1_t b) { return vext_p64(a, b, n); }
516template <int n>[[gnu::always_inline]] nce poly64x2_t extract(poly64x2_t a, poly64x2_t b) { return vextq_p64(a, b, n); }
517template <int lane>[[gnu::always_inline]] nce poly64x1_t set_lane(poly64_t a, poly64x1_t v) { return vset_lane_p64(a, v, lane); }
518template <int lane>[[gnu::always_inline]] nce poly64x2_t set_lane(poly64_t a, poly64x2_t v) { return vsetq_lane_p64(a, v, lane); }
// poly64 load wrappers. All read through `ptr` and forward to the vld*_p64 intrinsic named in
// the body; the `template <>` forms pick the intrinsic from the requested return type
// (e.g. poly64x1_t -> vld1_p64, poly64x2_t -> vld1q_p64).
// Most of these are declared with plain `inline`; only the load1_lane templates use `nce`.
// load2/load3/load4 and their *_duplicate forms appear here only for the 64-bit
// poly64x1xN_t tuple types, whereas load1_xN has both poly64x1xN_t and poly64x2xN_t forms.
// load_register reads a whole poly128_t through vldrq_p128.
519template <> [[gnu::always_inline]] inline poly64x1_t load1(poly64_t const *ptr) { return vld1_p64(ptr); }
520template <> [[gnu::always_inline]] inline poly64x2_t load1(poly64_t const *ptr) { return vld1q_p64(ptr); }
521template <int lane>[[gnu::always_inline]] nce poly64x1_t load1_lane(poly64_t const *ptr, poly64x1_t src) { return vld1_lane_p64(ptr, src, lane); }
522template <int lane>[[gnu::always_inline]] nce poly64x2_t load1_lane(poly64_t const *ptr, poly64x2_t src) { return vld1q_lane_p64(ptr, src, lane); }
523template <> [[gnu::always_inline]] inline poly64x1_t load1_duplicate(poly64_t const *ptr) { return vld1_dup_p64(ptr); }
524template <> [[gnu::always_inline]] inline poly64x2_t load1_duplicate(poly64_t const *ptr) { return vld1q_dup_p64(ptr); }
525template <> [[gnu::always_inline]] inline poly64x1x2_t load2(poly64_t const *ptr) { return vld2_p64(ptr); }
526template <> [[gnu::always_inline]] inline poly64x1x3_t load3(poly64_t const *ptr) { return vld3_p64(ptr); }
527template <> [[gnu::always_inline]] inline poly64x1x4_t load4(poly64_t const *ptr) { return vld4_p64(ptr); }
528template <> [[gnu::always_inline]] inline poly64x1x2_t load2_duplicate(poly64_t const *ptr) { return vld2_dup_p64(ptr); }
529template <> [[gnu::always_inline]] inline poly64x1x3_t load3_duplicate(poly64_t const *ptr) { return vld3_dup_p64(ptr); }
530template <> [[gnu::always_inline]] inline poly64x1x4_t load4_duplicate(poly64_t const *ptr) { return vld4_dup_p64(ptr); }
531template <> [[gnu::always_inline]] inline poly64x1x2_t load1_x2(poly64_t const *ptr) { return vld1_p64_x2(ptr); }
532template <> [[gnu::always_inline]] inline poly64x2x2_t load1_x2(poly64_t const *ptr) { return vld1q_p64_x2(ptr); }
533template <> [[gnu::always_inline]] inline poly64x1x3_t load1_x3(poly64_t const *ptr) { return vld1_p64_x3(ptr); }
534template <> [[gnu::always_inline]] inline poly64x2x3_t load1_x3(poly64_t const *ptr) { return vld1q_p64_x3(ptr); }
535template <> [[gnu::always_inline]] inline poly64x1x4_t load1_x4(poly64_t const *ptr) { return vld1_p64_x4(ptr); }
536template <> [[gnu::always_inline]] inline poly64x2x4_t load1_x4(poly64_t const *ptr) { return vld1q_p64_x4(ptr); }
537[[gnu::always_inline]] inline poly128_t load_register(poly128_t const *ptr) { return vldrq_p128(ptr); }
// poly64 store wrappers. Each writes `val` through `ptr` via the vst*_p64 intrinsic named in
// its body; `return <void call>;` simply forwards the intrinsic's void result.
// store1 is a `template <>` specialization, the store1_lane templates take the lane as a
// template parameter, and the remaining stores are plain overloads on the tuple type.
// store_register writes a whole poly128_t through vstrq_p128.
// NOTE(review): store1_x2 and store1_x4 each have a poly64x2xN_t overload, but no
// store1_x3(poly64_t *, poly64x2x3_t) appears in this part of the file even though
// load1_x3 has a poly64x2x3_t form — confirm whether the omission is intentional.
538template <> [[gnu::always_inline]] inline void store1(poly64_t *ptr, poly64x1_t val) { return vst1_p64(ptr, val); }
539template <> [[gnu::always_inline]] inline void store1(poly64_t *ptr, poly64x2_t val) { return vst1q_p64(ptr, val); }
540template <int lane>[[gnu::always_inline]] nce void store1_lane(poly64_t *ptr, poly64x1_t val) { return vst1_lane_p64(ptr, val, lane); }
541template <int lane>[[gnu::always_inline]] nce void store1_lane(poly64_t *ptr, poly64x2_t val) { return vst1q_lane_p64(ptr, val, lane); }
542[[gnu::always_inline]] inline void store2(poly64_t *ptr, poly64x1x2_t val) { return vst2_p64(ptr, val); }
543[[gnu::always_inline]] inline void store3(poly64_t *ptr, poly64x1x3_t val) { return vst3_p64(ptr, val); }
544[[gnu::always_inline]] inline void store4(poly64_t *ptr, poly64x1x4_t val) { return vst4_p64(ptr, val); }
545[[gnu::always_inline]] inline void store1_x2(poly64_t *ptr, poly64x1x2_t val) { return vst1_p64_x2(ptr, val); }
546[[gnu::always_inline]] inline void store1_x2(poly64_t *ptr, poly64x2x2_t val) { return vst1q_p64_x2(ptr, val); }
547[[gnu::always_inline]] inline void store1_x3(poly64_t *ptr, poly64x1x3_t val) { return vst1_p64_x3(ptr, val); }
548[[gnu::always_inline]] inline void store1_x4(poly64_t *ptr, poly64x1x4_t val) { return vst1_p64_x4(ptr, val); }
549[[gnu::always_inline]] inline void store1_x4(poly64_t *ptr, poly64x2x4_t val) { return vst1q_p64_x4(ptr, val); }
550[[gnu::always_inline]] inline void store_register(poly128_t *ptr, poly128_t val) { return vstrq_p128(ptr, val); }
// Scalar SHA1 helper (vsha1h_u32 on a uint32_t) and the polynomial multiplies that produce a
// poly128_t: multiply_long takes two scalar poly64_t values (vmull_p64), multiply_long_high
// takes two poly64x2_t vectors (vmull_high_p64).
551[[gnu::always_inline]] nce uint32_t sha1_fixed_rotate(uint32_t hash_e) { return vsha1h_u32(hash_e); }
552[[gnu::always_inline]] nce poly128_t multiply_long(poly64_t a, poly64_t b) { return vmull_p64(a, b); }
553[[gnu::always_inline]] nce poly128_t multiply_long_high(poly64x2_t a, poly64x2_t b) { return vmull_high_p64(a, b); }
554
// bfloat16 construction and element-access wrappers, mirroring the poly64 set: create,
// duplicate (return type picks vdup_n_bf16 vs vdupq_n_bf16), duplicate_lane /
// duplicate_lane_quad for both 64-bit and 128-bit sources, combine, get_high / get_low,
// get_lane and set_lane.
// NOTE(review): the two vduph_* wrappers (returning scalar bfloat16_t) share their name,
// template parameter list and argument type with the vector-returning duplicate_lane
// overloads a few lines earlier and differ only in return type. A plain call such as
// duplicate_lane<0>(vec) looks ambiguous between them — confirm how callers are expected to
// select one, or whether the scalar forms should carry a distinct name.
555template <> [[gnu::always_inline]] nce bfloat16x4_t create(uint64_t a) { return vcreate_bf16(a); }
556template <> [[gnu::always_inline]] nce bfloat16x4_t duplicate(bfloat16_t value) { return vdup_n_bf16(value); }
557template <> [[gnu::always_inline]] nce bfloat16x8_t duplicate(bfloat16_t value) { return vdupq_n_bf16(value); }
558template <int lane>[[gnu::always_inline]] nce bfloat16x4_t duplicate_lane(bfloat16x4_t vec) { return vdup_lane_bf16(vec, lane); }
559template <int lane>[[gnu::always_inline]] nce bfloat16x8_t duplicate_lane_quad(bfloat16x4_t vec) { return vdupq_lane_bf16(vec, lane); }
560template <int lane>[[gnu::always_inline]] nce bfloat16x4_t duplicate_lane(bfloat16x8_t vec) { return vdup_laneq_bf16(vec, lane); }
561template <int lane>[[gnu::always_inline]] nce bfloat16x8_t duplicate_lane_quad(bfloat16x8_t vec) { return vdupq_laneq_bf16(vec, lane); }
562template <int lane>[[gnu::always_inline]] nce bfloat16_t duplicate_lane(bfloat16x4_t vec) { return vduph_lane_bf16(vec, lane); }
563template <int lane>[[gnu::always_inline]] nce bfloat16_t duplicate_lane(bfloat16x8_t vec) { return vduph_laneq_bf16(vec, lane); }
564[[gnu::always_inline]] nce bfloat16x8_t combine(bfloat16x4_t low, bfloat16x4_t high) { return vcombine_bf16(low, high); }
565template <> [[gnu::always_inline]] nce bfloat16x4_t get_high(bfloat16x8_t a) { return vget_high_bf16(a); }
566template <> [[gnu::always_inline]] nce bfloat16x4_t get_low(bfloat16x8_t a) { return vget_low_bf16(a); }
567template <int lane>[[gnu::always_inline]] nce bfloat16_t get_lane(bfloat16x4_t v) { return vget_lane_bf16(v, lane); }
568template <int lane>[[gnu::always_inline]] nce bfloat16_t get_lane(bfloat16x8_t v) { return vgetq_lane_bf16(v, lane); }
569template <int lane>[[gnu::always_inline]] nce bfloat16x4_t set_lane(bfloat16_t a, bfloat16x4_t v) { return vset_lane_bf16(a, v, lane); }
570template <int lane>[[gnu::always_inline]] nce bfloat16x8_t set_lane(bfloat16_t a, bfloat16x8_t v) { return vsetq_lane_bf16(a, v, lane); }
571template <> [[gnu::always_inline]] inline bfloat16x4_t load1(bfloat16_t const *ptr) { return vld1_bf16(ptr); }
572template <> [[gnu::always_inline]] inline bfloat16x8_t load1(bfloat16_t const *ptr) { return vld1q_bf16(ptr); }
573template <int lane>[[gnu::always_inline]] nce bfloat16x4_t load1_lane(bfloat16_t const *ptr, bfloat16x4_t src) { return vld1_lane_bf16(ptr, src, lane); }
574template <int lane>[[gnu::always_inline]] nce bfloat16x8_t load1_lane(bfloat16_t const *ptr, bfloat16x8_t src) { return vld1q_lane_bf16(ptr, src, lane); }
575template <> [[gnu::always_inline]] inline bfloat16x4_t load1_duplicate(bfloat16_t const *ptr) { return vld1_dup_bf16(ptr); }
576template <> [[gnu::always_inline]] inline bfloat16x8_t load1_duplicate(bfloat16_t const *ptr) { return vld1q_dup_bf16(ptr); }
577template <> [[gnu::always_inline]] inline bfloat16x4x2_t load2(bfloat16_t const *ptr) { return vld2_bf16(ptr); }
578template <> [[gnu::always_inline]] inline bfloat16x8x2_t load2(bfloat16_t const *ptr) { return vld2q_bf16(ptr); }
579template <> [[gnu::always_inline]] inline bfloat16x4x3_t load3(bfloat16_t const *ptr) { return vld3_bf16(ptr); }
580template <> [[gnu::always_inline]] inline bfloat16x8x3_t load3(bfloat16_t const *ptr) { return vld3q_bf16(ptr); }
581template <> [[gnu::always_inline]] inline bfloat16x4x4_t load4(bfloat16_t const *ptr) { return vld4_bf16(ptr); }
582template <> [[gnu::always_inline]] inline bfloat16x8x4_t load4(bfloat16_t const *ptr) { return vld4q_bf16(ptr); }
583template <> [[gnu::always_inline]] inline bfloat16x4x2_t load2_duplicate(bfloat16_t const *ptr) { return vld2_dup_bf16(ptr); }
584template <> [[gnu::always_inline]] inline bfloat16x8x2_t load2_duplicate(bfloat16_t const *ptr) { return vld2q_dup_bf16(ptr); }
585template <> [[gnu::always_inline]] inline bfloat16x4x3_t load3_duplicate(bfloat16_t const *ptr) { return vld3_dup_bf16(ptr); }
586template <> [[gnu::always_inline]] inline bfloat16x8x3_t load3_duplicate(bfloat16_t const *ptr) { return vld3q_dup_bf16(ptr); }
587template <> [[gnu::always_inline]] inline bfloat16x4x4_t load4_duplicate(bfloat16_t const *ptr) { return vld4_dup_bf16(ptr); }
588template <> [[gnu::always_inline]] inline bfloat16x8x4_t load4_duplicate(bfloat16_t const *ptr) { return vld4q_dup_bf16(ptr); }
589template <int lane>[[gnu::always_inline]] nce bfloat16x4x2_t load2_lane(bfloat16_t const *ptr, bfloat16x4x2_t src) { return vld2_lane_bf16(ptr, src, lane); }
590template <int lane>[[gnu::always_inline]] nce bfloat16x8x2_t load2_lane(bfloat16_t const *ptr, bfloat16x8x2_t src) { return vld2q_lane_bf16(ptr, src, lane); }
591template <int lane>[[gnu::always_inline]] nce bfloat16x4x3_t load3_lane(bfloat16_t const *ptr, bfloat16x4x3_t src) { return vld3_lane_bf16(ptr, src, lane); }
592template <int lane>[[gnu::always_inline]] nce bfloat16x8x3_t load3_lane(bfloat16_t const *ptr, bfloat16x8x3_t src) { return vld3q_lane_bf16(ptr, src, lane); }
593template <int lane>[[gnu::always_inline]] nce bfloat16x4x4_t load4_lane(bfloat16_t const *ptr, bfloat16x4x4_t src) { return vld4_lane_bf16(ptr, src, lane); }
594template <int lane>[[gnu::always_inline]] nce bfloat16x8x4_t load4_lane(bfloat16_t const *ptr, bfloat16x8x4_t src) { return vld4q_lane_bf16(ptr, src, lane); }
595template <> [[gnu::always_inline]] inline bfloat16x4x2_t load1_x2(bfloat16_t const *ptr) { return vld1_bf16_x2(ptr); }
596template <> [[gnu::always_inline]] inline bfloat16x8x2_t load1_x2(bfloat16_t const *ptr) { return vld1q_bf16_x2(ptr); }
597template <> [[gnu::always_inline]] inline bfloat16x4x3_t load1_x3(bfloat16_t const *ptr) { return vld1_bf16_x3(ptr); }
598template <> [[gnu::always_inline]] inline bfloat16x8x3_t load1_x3(bfloat16_t const *ptr) { return vld1q_bf16_x3(ptr); }
599template <> [[gnu::always_inline]] inline bfloat16x4x4_t load1_x4(bfloat16_t const *ptr) { return vld1_bf16_x4(ptr); }
600template <> [[gnu::always_inline]] inline bfloat16x8x4_t load1_x4(bfloat16_t const *ptr) { return vld1q_bf16_x4(ptr); }
601template <> [[gnu::always_inline]] inline void store1(bfloat16_t *ptr, bfloat16x4_t val) { return vst1_bf16(ptr, val); }
602template <> [[gnu::always_inline]] inline void store1(bfloat16_t *ptr, bfloat16x8_t val) { return vst1q_bf16(ptr, val); }
603template <int lane>[[gnu::always_inline]] nce void store1_lane(bfloat16_t *ptr, bfloat16x4_t val) { return vst1_lane_bf16(ptr, val, lane); }
604template <int lane>[[gnu::always_inline]] nce void store1_lane(bfloat16_t *ptr, bfloat16x8_t val) { return vst1q_lane_bf16(ptr, val, lane); }
605template <> [[gnu::always_inline]] inline void store2(bfloat16_t *ptr, bfloat16x4x2_t val) { return vst2_bf16(ptr, val); }
606template <> [[gnu::always_inline]] inline void store2(bfloat16_t *ptr, bfloat16x8x2_t val) { return vst2q_bf16(ptr, val); }
607template <> [[gnu::always_inline]] inline void store3(bfloat16_t *ptr, bfloat16x4x3_t val) { return vst3_bf16(ptr, val); }
608template <> [[gnu::always_inline]] inline void store3(bfloat16_t *ptr, bfloat16x8x3_t val) { return vst3q_bf16(ptr, val); }
609template <> [[gnu::always_inline]] inline void store4(bfloat16_t *ptr, bfloat16x4x4_t val) { return vst4_bf16(ptr, val); }
610template <> [[gnu::always_inline]] inline void store4(bfloat16_t *ptr, bfloat16x8x4_t val) { return vst4q_bf16(ptr, val); }
611template <int lane>[[gnu::always_inline]] nce void store2_lane(bfloat16_t *ptr, bfloat16x4x2_t val) { return vst2_lane_bf16(ptr, val, lane); }
612template <int lane>[[gnu::always_inline]] nce void store2_lane(bfloat16_t *ptr, bfloat16x8x2_t val) { return vst2q_lane_bf16(ptr, val, lane); }
613template <int lane>[[gnu::always_inline]] nce void store3_lane(bfloat16_t *ptr, bfloat16x4x3_t val) { return vst3_lane_bf16(ptr, val, lane); }
614template <int lane>[[gnu::always_inline]] nce void store3_lane(bfloat16_t *ptr, bfloat16x8x3_t val) { return vst3q_lane_bf16(ptr, val, lane); }
615template <int lane>[[gnu::always_inline]] nce void store4_lane(bfloat16_t *ptr, bfloat16x4x4_t val) { return vst4_lane_bf16(ptr, val, lane); }
616template <int lane>[[gnu::always_inline]] nce void store4_lane(bfloat16_t *ptr, bfloat16x8x4_t val) { return vst4q_lane_bf16(ptr, val, lane); }
617[[gnu::always_inline]] inline void store1_x2(bfloat16_t *ptr, bfloat16x4x2_t val) { return vst1_bf16_x2(ptr, val); }
618[[gnu::always_inline]] inline void store1_x2(bfloat16_t *ptr, bfloat16x8x2_t val) { return vst1q_bf16_x2(ptr, val); }
619[[gnu::always_inline]] inline void store1_x3(bfloat16_t *ptr, bfloat16x4x3_t val) { return vst1_bf16_x3(ptr, val); }
620[[gnu::always_inline]] inline void store1_x3(bfloat16_t *ptr, bfloat16x8x3_t val) { return vst1q_bf16_x3(ptr, val); }
621[[gnu::always_inline]] inline void store1_x4(bfloat16_t *ptr, bfloat16x4x4_t val) { return vst1_bf16_x4(ptr, val); }
622[[gnu::always_inline]] inline void store1_x4(bfloat16_t *ptr, bfloat16x8x4_t val) { return vst1q_bf16_x4(ptr, val); }
623[[gnu::always_inline]] nce bfloat16x4_t reinterpret(int64x1_t a) { return vreinterpret_bf16_s64(a); }
624template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(poly8x16_t a) { return vreinterpretq_bf16_p8(a); }
625template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(poly16x8_t a) { return vreinterpretq_bf16_p16(a); }
626template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(int64x2_t a) { return vreinterpretq_bf16_s64(a); }
627template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(poly64x1_t a) { return vreinterpret_bf16_p64(a); }
628template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(poly64x2_t a) { return vreinterpretq_bf16_p64(a); }
629template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(poly128_t a) { return vreinterpretq_bf16_p128(a); }
630template <> [[gnu::always_inline]] nce int8x8_t reinterpret(bfloat16x4_t a) { return vreinterpret_s8_bf16(a); }
631template <> [[gnu::always_inline]] nce int16x4_t reinterpret(bfloat16x4_t a) { return vreinterpret_s16_bf16(a); }
632template <> [[gnu::always_inline]] nce int32x2_t reinterpret(bfloat16x4_t a) { return vreinterpret_s32_bf16(a); }
633template <> [[gnu::always_inline]] nce float32x2_t reinterpret(bfloat16x4_t a) { return vreinterpret_f32_bf16(a); }
634template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(bfloat16x4_t a) { return vreinterpret_u8_bf16(a); }
635template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(bfloat16x4_t a) { return vreinterpret_u16_bf16(a); }
636template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(bfloat16x4_t a) { return vreinterpret_u32_bf16(a); }
637template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(bfloat16x4_t a) { return vreinterpret_p8_bf16(a); }
638template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(bfloat16x4_t a) { return vreinterpret_p16_bf16(a); }
639template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(bfloat16x4_t a) { return vreinterpret_u64_bf16(a); }
640template <> [[gnu::always_inline]] nce int64x1_t reinterpret(bfloat16x4_t a) { return vreinterpret_s64_bf16(a); }
641template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(bfloat16x4_t a) { return vreinterpret_p64_bf16(a); }
642template <> [[gnu::always_inline]] nce int8x16_t reinterpret(bfloat16x8_t a) { return vreinterpretq_s8_bf16(a); }
643template <> [[gnu::always_inline]] nce int16x8_t reinterpret(bfloat16x8_t a) { return vreinterpretq_s16_bf16(a); }
644template <> [[gnu::always_inline]] nce int32x4_t reinterpret(bfloat16x8_t a) { return vreinterpretq_s32_bf16(a); }
645template <> [[gnu::always_inline]] nce float32x4_t reinterpret(bfloat16x8_t a) { return vreinterpretq_f32_bf16(a); }
646template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(bfloat16x8_t a) { return vreinterpretq_u8_bf16(a); }
647template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(bfloat16x8_t a) { return vreinterpretq_u16_bf16(a); }
648template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(bfloat16x8_t a) { return vreinterpretq_u32_bf16(a); }
649template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(bfloat16x8_t a) { return vreinterpretq_p8_bf16(a); }
650template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(bfloat16x8_t a) { return vreinterpretq_p16_bf16(a); }
651template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(bfloat16x8_t a) { return vreinterpretq_u64_bf16(a); }
652template <> [[gnu::always_inline]] nce int64x2_t reinterpret(bfloat16x8_t a) { return vreinterpretq_s64_bf16(a); }
653template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(bfloat16x8_t a) { return vreinterpretq_p64_bf16(a); }
654template <> [[gnu::always_inline]] nce poly128_t reinterpret(bfloat16x8_t a) { return vreinterpretq_p128_bf16(a); }
655[[gnu::always_inline]] nce float32x4_t convert(bfloat16x4_t a) { return vcvt_f32_bf16(a); }
656template <> [[gnu::always_inline]] nce float32x4_t convert_low(bfloat16x8_t a) { return vcvtq_low_f32_bf16(a); }
657template <> [[gnu::always_inline]] nce float32x4_t convert_high(bfloat16x8_t a) { return vcvtq_high_f32_bf16(a); }
658[[gnu::always_inline]] nce bfloat16x8_t convert_high(bfloat16x8_t inactive, float32x4_t a) { return vcvtq_high_bf16_f32(inactive, a); }
659[[gnu::always_inline]] nce bfloat16_t convert(float32_t a) { return vcvth_bf16_f32(a); }
660[[gnu::always_inline]] nce float32_t convert_round_to_nearest_with_ties_away_from_zero(bfloat16_t a) { return vcvtah_f32_bf16(a); }
661// clang-format on
662} // namespace neon
663#undef nce
664#endif // __cplusplus