// Argon 0.1.0 - vfpv3.hpp
1#pragma once
2#include "arm_simd/shared/vfpv3_int.hpp"
3
4#include "arm_simd/shared/vfpv3_float.hpp"
5
6#ifdef __cplusplus
7#ifdef __clang__
8#define nce constexpr
9#else
10#define nce inline
11#endif
12
13namespace neon {
14
15// clang-format off
16template <typename T> nce T convert(float32x2_t a);
17template <typename T> nce T convert(int32x2_t a);
18template <typename T> nce T convert(uint32x2_t a);
19template <typename T> nce T reinterpret(int8x8_t a);
20template <typename T> nce T reinterpret(int16x4_t a);
21template <typename T> nce T reinterpret(int32x2_t a);
22template <typename T> nce T reinterpret(float32x2_t a);
23template <typename T> nce T reinterpret(uint8x8_t a);
24template <typename T> nce T reinterpret(uint16x4_t a);
25template <typename T> nce T reinterpret(uint32x2_t a);
26template <typename T> nce T reinterpret(poly8x8_t a);
27template <typename T> nce T reinterpret(poly16x4_t a);
28template <typename T> nce T reinterpret(uint64x1_t a);
29template <typename T> nce T reinterpret(int64x1_t a);
30template <typename T> nce T reinterpret(poly8x16_t a);
31template <typename T> nce T reinterpret(poly16x8_t a);
32template <typename T> nce T create(uint64_t a);
33template <typename T> nce T duplicate(int8_t value);
34template <typename T> nce T duplicate(int16_t value);
35template <typename T> nce T duplicate(int32_t value);
36template <typename T> nce T duplicate(int64_t value);
37template <typename T> nce T duplicate(uint8_t value);
38template <typename T> nce T duplicate(uint16_t value);
39template <typename T> nce T duplicate(uint32_t value);
40template <typename T> nce T duplicate(uint64_t value);
41template <typename T> nce T duplicate(float32_t value);
42template <typename T> nce T duplicate(poly8_t value);
43template <typename T> nce T duplicate(poly16_t value);
44template <typename T> nce T load1(int8_t const *ptr);
45template <typename T> nce T load1(int16_t const *ptr);
46template <typename T> nce T load1(int32_t const *ptr);
47template <typename T> nce T load1(int64_t const *ptr);
48template <typename T> nce T load1(uint8_t const *ptr);
49template <typename T> nce T load1(uint16_t const *ptr);
50template <typename T> nce T load1(uint32_t const *ptr);
51template <typename T> nce T load1(uint64_t const *ptr);
52template <typename T> nce T load1(float32_t const *ptr);
53template <typename T> nce T load1(poly8_t const *ptr);
54template <typename T> nce T load1(poly16_t const *ptr);
55template <typename T> nce T load1_duplicate(int8_t const *ptr);
56template <typename T> nce T load1_duplicate(int16_t const *ptr);
57template <typename T> nce T load1_duplicate(int32_t const *ptr);
58template <typename T> nce T load1_duplicate(int64_t const *ptr);
59template <typename T> nce T load1_duplicate(uint8_t const *ptr);
60template <typename T> nce T load1_duplicate(uint16_t const *ptr);
61template <typename T> nce T load1_duplicate(uint32_t const *ptr);
62template <typename T> nce T load1_duplicate(uint64_t const *ptr);
63template <typename T> nce T load1_duplicate(float32_t const *ptr);
64template <typename T> nce T load1_duplicate(poly8_t const *ptr);
65template <typename T> nce T load1_duplicate(poly16_t const *ptr);
66template <typename T> nce T load2(int8_t const *ptr);
67template <typename T> nce T load2(int16_t const *ptr);
68template <typename T> nce T load2(int32_t const *ptr);
69template <typename T> nce T load2(uint8_t const *ptr);
70template <typename T> nce T load2(uint16_t const *ptr);
71template <typename T> nce T load2(uint32_t const *ptr);
72template <typename T> nce T load2(float32_t const *ptr);
73template <typename T> nce T load2(poly8_t const *ptr);
74template <typename T> nce T load2(poly16_t const *ptr);
75template <typename T> nce T load3(int8_t const *ptr);
76template <typename T> nce T load3(int16_t const *ptr);
77template <typename T> nce T load3(int32_t const *ptr);
78template <typename T> nce T load3(uint8_t const *ptr);
79template <typename T> nce T load3(uint16_t const *ptr);
80template <typename T> nce T load3(uint32_t const *ptr);
81template <typename T> nce T load3(float32_t const *ptr);
82template <typename T> nce T load3(poly8_t const *ptr);
83template <typename T> nce T load3(poly16_t const *ptr);
84template <typename T> nce T load4(int8_t const *ptr);
85template <typename T> nce T load4(int16_t const *ptr);
86template <typename T> nce T load4(int32_t const *ptr);
87template <typename T> nce T load4(uint8_t const *ptr);
88template <typename T> nce T load4(uint16_t const *ptr);
89template <typename T> nce T load4(uint32_t const *ptr);
90template <typename T> nce T load4(float32_t const *ptr);
91template <typename T> nce T load4(poly8_t const *ptr);
92template <typename T> nce T load4(poly16_t const *ptr);
93template <typename T> nce T load2_duplicate(int8_t const *ptr);
94template <typename T> nce T load2_duplicate(int16_t const *ptr);
95template <typename T> nce T load2_duplicate(int32_t const *ptr);
96template <typename T> nce T load2_duplicate(uint8_t const *ptr);
97template <typename T> nce T load2_duplicate(uint16_t const *ptr);
98template <typename T> nce T load2_duplicate(uint32_t const *ptr);
99template <typename T> nce T load2_duplicate(float32_t const *ptr);
100template <typename T> nce T load2_duplicate(poly8_t const *ptr);
101template <typename T> nce T load2_duplicate(poly16_t const *ptr);
102template <typename T> nce T load3_duplicate(int8_t const *ptr);
103template <typename T> nce T load3_duplicate(int16_t const *ptr);
104template <typename T> nce T load3_duplicate(int32_t const *ptr);
105template <typename T> nce T load3_duplicate(uint8_t const *ptr);
106template <typename T> nce T load3_duplicate(uint16_t const *ptr);
107template <typename T> nce T load3_duplicate(uint32_t const *ptr);
108template <typename T> nce T load3_duplicate(float32_t const *ptr);
109template <typename T> nce T load3_duplicate(poly8_t const *ptr);
110template <typename T> nce T load3_duplicate(poly16_t const *ptr);
111template <typename T> nce T load4_duplicate(int8_t const *ptr);
112template <typename T> nce T load4_duplicate(int16_t const *ptr);
113template <typename T> nce T load4_duplicate(int32_t const *ptr);
114template <typename T> nce T load4_duplicate(uint8_t const *ptr);
115template <typename T> nce T load4_duplicate(uint16_t const *ptr);
116template <typename T> nce T load4_duplicate(uint32_t const *ptr);
117template <typename T> nce T load4_duplicate(float32_t const *ptr);
118template <typename T> nce T load4_duplicate(poly8_t const *ptr);
119template <typename T> nce T load4_duplicate(poly16_t const *ptr);
// `load1_x2<T>`, `load1_x3<T>` and `load1_x4<T>` primary templates. They are
// only declared when building with clang or with a GCC whose major version is
// greater than 13.
// NOTE(review): presumably older GCC releases lack the underlying intrinsics -
// confirm against the definitions.
#if defined(__clang__) || (__GNUC__ > 13)
template <typename T> nce T load1_x2(int8_t const *ptr);
template <typename T> nce T load1_x2(int16_t const *ptr);
template <typename T> nce T load1_x2(int32_t const *ptr);
template <typename T> nce T load1_x2(uint8_t const *ptr);
template <typename T> nce T load1_x2(uint16_t const *ptr);
template <typename T> nce T load1_x2(uint32_t const *ptr);
template <typename T> nce T load1_x2(float32_t const *ptr);
template <typename T> nce T load1_x2(poly8_t const *ptr);
template <typename T> nce T load1_x2(poly16_t const *ptr);
template <typename T> nce T load1_x2(int64_t const *ptr);
template <typename T> nce T load1_x2(uint64_t const *ptr);
template <typename T> nce T load1_x3(int8_t const *ptr);
template <typename T> nce T load1_x3(int16_t const *ptr);
template <typename T> nce T load1_x3(int32_t const *ptr);
template <typename T> nce T load1_x3(uint8_t const *ptr);
template <typename T> nce T load1_x3(uint16_t const *ptr);
template <typename T> nce T load1_x3(uint32_t const *ptr);
template <typename T> nce T load1_x3(float32_t const *ptr);
template <typename T> nce T load1_x3(poly8_t const *ptr);
template <typename T> nce T load1_x3(poly16_t const *ptr);
template <typename T> nce T load1_x3(int64_t const *ptr);
template <typename T> nce T load1_x3(uint64_t const *ptr);
template <typename T> nce T load1_x4(int8_t const *ptr);
template <typename T> nce T load1_x4(int16_t const *ptr);
template <typename T> nce T load1_x4(int32_t const *ptr);
template <typename T> nce T load1_x4(uint8_t const *ptr);
template <typename T> nce T load1_x4(uint16_t const *ptr);
template <typename T> nce T load1_x4(uint32_t const *ptr);
template <typename T> nce T load1_x4(float32_t const *ptr);
template <typename T> nce T load1_x4(poly8_t const *ptr);
template <typename T> nce T load1_x4(poly16_t const *ptr);
template <typename T> nce T load1_x4(int64_t const *ptr);
template <typename T> nce T load1_x4(uint64_t const *ptr);
#endif
155template <typename T> nce T load2(int64_t const *ptr);
156template <typename T> nce T load2(uint64_t const *ptr);
157template <typename T> nce T load3(int64_t const *ptr);
158template <typename T> nce T load3(uint64_t const *ptr);
159template <typename T> nce T load4(int64_t const *ptr);
160template <typename T> nce T load4(uint64_t const *ptr);
161template <typename T> nce T load2_duplicate(int64_t const *ptr);
162template <typename T> nce T load2_duplicate(uint64_t const *ptr);
163template <typename T> nce T load3_duplicate(int64_t const *ptr);
164template <typename T> nce T load3_duplicate(uint64_t const *ptr);
165template <typename T> nce T load4_duplicate(int64_t const *ptr);
166template <typename T> nce T load4_duplicate(uint64_t const *ptr);
167
168inline void store1(int8_t *ptr, int8x8_t val);
169inline void store1(int8_t *ptr, int8x16_t val);
170inline void store1(int16_t *ptr, int16x4_t val);
171inline void store1(int16_t *ptr, int16x8_t val);
172inline void store1(int32_t *ptr, int32x2_t val);
173inline void store1(int32_t *ptr, int32x4_t val);
174inline void store1(int64_t *ptr, int64x1_t val);
175inline void store1(int64_t *ptr, int64x2_t val);
176inline void store1(uint8_t *ptr, uint8x8_t val);
177inline void store1(uint8_t *ptr, uint8x16_t val);
178inline void store1(uint16_t *ptr, uint16x4_t val);
179inline void store1(uint16_t *ptr, uint16x8_t val);
180inline void store1(uint32_t *ptr, uint32x2_t val);
181inline void store1(uint32_t *ptr, uint32x4_t val);
182inline void store1(uint64_t *ptr, uint64x1_t val);
183inline void store1(uint64_t *ptr, uint64x2_t val);
184inline void store1(float32_t *ptr, float32x2_t val);
185inline void store1(float32_t *ptr, float32x4_t val);
186inline void store1(poly8_t *ptr, poly8x8_t val);
187inline void store1(poly8_t *ptr, poly8x16_t val);
188inline void store1(poly16_t *ptr, poly16x4_t val);
189inline void store1(poly16_t *ptr, poly16x8_t val);
190inline void store2(int8_t *ptr, int8x8x2_t val);
191inline void store2(int16_t *ptr, int16x4x2_t val);
192inline void store2(int16_t *ptr, int16x8x2_t val);
193inline void store2(int32_t *ptr, int32x2x2_t val);
194inline void store2(int32_t *ptr, int32x4x2_t val);
195inline void store2(uint8_t *ptr, uint8x8x2_t val);
196inline void store2(uint16_t *ptr, uint16x4x2_t val);
197inline void store2(uint16_t *ptr, uint16x8x2_t val);
198inline void store2(uint32_t *ptr, uint32x2x2_t val);
199inline void store2(uint32_t *ptr, uint32x4x2_t val);
200inline void store2(float32_t *ptr, float32x2x2_t val);
201inline void store2(float32_t *ptr, float32x4x2_t val);
202inline void store2(poly8_t *ptr, poly8x8x2_t val);
203inline void store2(poly16_t *ptr, poly16x4x2_t val);
204inline void store2(poly16_t *ptr, poly16x8x2_t val);
205inline void store3(int8_t *ptr, int8x8x3_t val);
206inline void store3(int8_t *ptr, int8x16x3_t val);
207inline void store3(int16_t *ptr, int16x4x3_t val);
208inline void store3(int16_t *ptr, int16x8x3_t val);
209inline void store3(int32_t *ptr, int32x2x3_t val);
210inline void store3(int32_t *ptr, int32x4x3_t val);
211inline void store3(uint8_t *ptr, uint8x8x3_t val);
212inline void store3(uint8_t *ptr, uint8x16x3_t val);
213inline void store3(uint16_t *ptr, uint16x4x3_t val);
214inline void store3(uint16_t *ptr, uint16x8x3_t val);
215inline void store3(uint32_t *ptr, uint32x2x3_t val);
216inline void store3(uint32_t *ptr, uint32x4x3_t val);
217inline void store3(float32_t *ptr, float32x2x3_t val);
218inline void store3(float32_t *ptr, float32x4x3_t val);
219inline void store3(poly8_t *ptr, poly8x8x3_t val);
220inline void store3(poly8_t *ptr, poly8x16x3_t val);
221inline void store3(poly16_t *ptr, poly16x4x3_t val);
222inline void store3(poly16_t *ptr, poly16x8x3_t val);
223inline void store4(int8_t *ptr, int8x8x4_t val);
224inline void store4(int16_t *ptr, int16x4x4_t val);
225inline void store4(int16_t *ptr, int16x8x4_t val);
226inline void store4(int32_t *ptr, int32x2x4_t val);
227inline void store4(int32_t *ptr, int32x4x4_t val);
228inline void store4(uint8_t *ptr, uint8x8x4_t val);
229inline void store4(uint16_t *ptr, uint16x4x4_t val);
230inline void store4(uint16_t *ptr, uint16x8x4_t val);
231inline void store4(uint32_t *ptr, uint32x2x4_t val);
232inline void store4(uint32_t *ptr, uint32x4x4_t val);
233inline void store4(float32_t *ptr, float32x2x4_t val);
234inline void store4(float32_t *ptr, float32x4x4_t val);
235inline void store4(poly8_t *ptr, poly8x8x4_t val);
236inline void store4(poly16_t *ptr, poly16x4x4_t val);
237inline void store4(poly16_t *ptr, poly16x8x4_t val);
238[[gnu::always_inline]] nce uint8x8_t add(uint8x8_t a, uint8x8_t b) { return vadd_u8(a, b); }
239[[gnu::always_inline]] nce uint16x8_t add_long(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); }
240[[gnu::always_inline]] nce uint8x8_t add_halve(uint8x8_t a, uint8x8_t b) { return vhadd_u8(a, b); }
241[[gnu::always_inline]] nce uint8x8_t add_halve_round(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); }
242[[gnu::always_inline]] nce uint8x8_t add_saturate(uint8x8_t a, uint8x8_t b) { return vqadd_u8(a, b); }
243[[gnu::always_inline]] nce uint8x8_t multiply(uint8x8_t a, uint8x8_t b) { return vmul_u8(a, b); }
244[[gnu::always_inline]] nce uint8x8_t multiply_add(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmla_u8(a, b, c); }
245[[gnu::always_inline]] nce uint8x8_t multiply_subtract(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmls_u8(a, b, c); }
246[[gnu::always_inline]] nce uint16x8_t multiply_long(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); }
247[[gnu::always_inline]] nce uint8x8_t subtract(uint8x8_t a, uint8x8_t b) { return vsub_u8(a, b); }
248[[gnu::always_inline]] nce uint16x8_t subtract_long(uint8x8_t a, uint8x8_t b) { return vsubl_u8(a, b); }
249[[gnu::always_inline]] nce uint8x8_t subtract_halve(uint8x8_t a, uint8x8_t b) { return vhsub_u8(a, b); }
250[[gnu::always_inline]] nce uint8x8_t subtract_saturate(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); }
251[[gnu::always_inline]] nce uint8x8_t subtract_absolute(uint8x8_t a, uint8x8_t b) { return vabd_u8(a, b); }
252[[gnu::always_inline]] nce uint16x8_t subtract_absolute_long(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); }
253[[gnu::always_inline]] nce uint8x8_t subtract_absolute_add(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vaba_u8(a, b, c); }
254[[gnu::always_inline]] nce uint8x8_t max(uint8x8_t a, uint8x8_t b) { return vmax_u8(a, b); }
255[[gnu::always_inline]] nce uint8x8_t min(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); }
256[[gnu::always_inline]] nce uint8x8_t pairwise_add(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); }
257[[gnu::always_inline]] nce uint16x4_t pairwise_add_long(uint8x8_t a) { return vpaddl_u8(a); }
258[[gnu::always_inline]] nce uint8x8_t pairwise_max(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); }
259[[gnu::always_inline]] nce uint8x8_t pairwise_min(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); }
260[[gnu::always_inline]] nce uint8x8_t equal(uint8x8_t a, uint8x8_t b) { return vceq_u8(a, b); }
261[[gnu::always_inline]] nce uint8x8_t greater_than_or_equal(uint8x8_t a, uint8x8_t b) { return vcge_u8(a, b); }
262[[gnu::always_inline]] nce uint8x8_t less_than_or_equal(uint8x8_t a, uint8x8_t b) { return vcle_u8(a, b); }
263[[gnu::always_inline]] nce uint8x8_t greater_than(uint8x8_t a, uint8x8_t b) { return vcgt_u8(a, b); }
264[[gnu::always_inline]] nce uint8x8_t less_than(uint8x8_t a, uint8x8_t b) { return vclt_u8(a, b); }
265[[gnu::always_inline]] nce uint8x8_t compare_test_nonzero(uint8x8_t a, uint8x8_t b) { return vtst_u8(a, b); }
266[[gnu::always_inline]] nce uint8x8_t shift_left(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); }
267template <int n>[[gnu::always_inline]] nce uint8x8_t shift_left(uint8x8_t a) { return vshl_n_u8(a, n); }
268[[gnu::always_inline]] nce uint8x8_t shift_left_saturate(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); }
269template <int n>[[gnu::always_inline]] nce uint8x8_t shift_left_saturate(uint8x8_t a) { return vqshl_n_u8(a, n); }
270[[gnu::always_inline]] nce uint8x8_t shift_left_round(uint8x8_t a, int8x8_t b) { return vrshl_u8(a, b); }
271[[gnu::always_inline]] nce uint8x8_t shift_left_round_saturate(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); }
272template <int n>[[gnu::always_inline]] nce uint16x8_t shift_left_long(uint8x8_t a) { return vshll_n_u8(a, n); }
273template <int n>[[gnu::always_inline]] nce uint8x8_t shift_left_insert(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, n); }
274template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right(uint8x8_t a) { return vshr_n_u8(a, n); }
275template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_round(uint8x8_t a) { return vrshr_n_u8(a, n); }
276template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_accumulate(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, n); }
277template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_accumulate_round(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, n); }
278template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_insert(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, n); }
279template <> [[gnu::always_inline]] nce int8x8_t reinterpret(uint8x8_t a) { return vreinterpret_s8_u8(a); }
280template <> [[gnu::always_inline]] nce int16x4_t reinterpret(uint8x8_t a) { return vreinterpret_s16_u8(a); }
281template <> [[gnu::always_inline]] nce int32x2_t reinterpret(uint8x8_t a) { return vreinterpret_s32_u8(a); }
282template <> [[gnu::always_inline]] nce float32x2_t reinterpret(uint8x8_t a) { return vreinterpret_f32_u8(a); }
283template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(uint8x8_t a) { return vreinterpret_u16_u8(a); }
284template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(uint8x8_t a) { return vreinterpret_u32_u8(a); }
285template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(uint8x8_t a) { return vreinterpret_p8_u8(a); }
286template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(uint8x8_t a) { return vreinterpret_p16_u8(a); }
287template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(uint8x8_t a) { return vreinterpret_u64_u8(a); }
288template <> [[gnu::always_inline]] nce int64x1_t reinterpret(uint8x8_t a) { return vreinterpret_s64_u8(a); }
289[[gnu::always_inline]] nce uint16x8_t move_long(uint8x8_t a) { return vmovl_u8(a); }
290[[gnu::always_inline]] nce uint8x8_t bitwise_not(uint8x8_t a) { return vmvn_u8(a); }
291[[gnu::always_inline]] nce uint8x8_t bitwise_and(uint8x8_t a, uint8x8_t b) { return vand_u8(a, b); }
292[[gnu::always_inline]] nce uint8x8_t bitwise_or(uint8x8_t a, uint8x8_t b) { return vorr_u8(a, b); }
293[[gnu::always_inline]] nce uint8x8_t bitwise_xor(uint8x8_t a, uint8x8_t b) { return veor_u8(a, b); }
294[[gnu::always_inline]] nce uint8x8_t bitwise_or_not(uint8x8_t a, uint8x8_t b) { return vorn_u8(a, b); }
295#ifdef __clang__
296[[gnu::always_inline]] nce int8x8_t count_leading_sign_bits(uint8x8_t a) { return vcls_u8(a); }
297#endif
298[[gnu::always_inline]] nce uint8x8_t count_leading_zero_bits(uint8x8_t a) { return vclz_u8(a); }
299[[gnu::always_inline]] nce uint8x8_t count_active_bits(uint8x8_t a) { return vcnt_u8(a); }
300[[gnu::always_inline]] nce uint8x8_t bitwise_clear(uint8x8_t a, uint8x8_t b) { return vbic_u8(a, b); }
301[[gnu::always_inline]] nce uint8x8_t bitwise_select(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vbsl_u8(a, b, c); }
302[[gnu::always_inline]] nce int8x8_t bitwise_select(uint8x8_t a, int8x8_t b, int8x8_t c) { return vbsl_s8(a, b, c); }
303[[gnu::always_inline]] nce poly8x8_t bitwise_select(uint8x8_t a, poly8x8_t b, poly8x8_t c) { return vbsl_p8(a, b, c); }
304template <int lane>[[gnu::always_inline]] nce uint8x8_t duplicate_lane(uint8x8_t a) { return vdup_lane_u8(a, lane); }
305template <int lane>[[gnu::always_inline]] nce uint8x16_t duplicate_lane_quad(uint8x8_t a) { return vdupq_lane_u8(a, lane); }
306[[gnu::always_inline]] nce uint8x16_t combine(uint8x8_t low, uint8x8_t high) { return vcombine_u8(low, high); }
307template <int lane>[[gnu::always_inline]] nce uint8_t get_lane(uint8x8_t v) { return vget_lane_u8(v, lane); }
308template <int n>[[gnu::always_inline]] nce uint8x8_t extract(uint8x8_t a, uint8x8_t b) { return vext_u8(a, b, n); }
309[[gnu::always_inline]] nce uint8x8_t reverse_64bit(uint8x8_t a) { return vrev64_u8(a); }
310[[gnu::always_inline]] nce uint8x8_t reverse_32bit(uint8x8_t a) { return vrev32_u8(a); }
311[[gnu::always_inline]] nce uint8x8_t reverse_16bit(uint8x8_t a) { return vrev16_u8(a); }
312[[gnu::always_inline]] nce uint8x8x2_t zip(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); }
313[[gnu::always_inline]] nce uint8x8x2_t unzip(uint8x8_t a, uint8x8_t b) { return vuzp_u8(a, b); }
314[[gnu::always_inline]] nce uint8x8x2_t transpose(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); }
315[[gnu::always_inline]] nce uint8x8_t table_lookup1(uint8x8_t a, uint8x8_t idx) { return vtbl1_u8(a, idx); }
316[[gnu::always_inline]] nce uint8x8_t table_extension1(uint8x8_t a, uint8x8_t b, uint8x8_t idx) { return vtbx1_u8(a, b, idx); }
317[[gnu::always_inline]] nce uint8x8_t table_extension2(uint8x8_t a, uint8x8x2_t b, uint8x8_t idx) { return vtbx2_u8(a, b, idx); }
318[[gnu::always_inline]] nce uint8x8_t table_extension3(uint8x8_t a, uint8x8x3_t b, uint8x8_t idx) { return vtbx3_u8(a, b, idx); }
319[[gnu::always_inline]] nce uint8x8_t table_extension4(uint8x8_t a, uint8x8x4_t b, uint8x8_t idx) { return vtbx4_u8(a, b, idx); }
320[[gnu::always_inline]] nce uint8x16_t multiply_add(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlaq_u8(a, b, c); }
321[[gnu::always_inline]] nce uint8x16_t multiply_subtract(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlsq_u8(a, b, c); }
322[[gnu::always_inline]] nce uint8x16_t subtract_absolute_add(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vabaq_u8(a, b, c); }
323[[gnu::always_inline]] nce uint16x8_t pairwise_add_long(uint8x16_t a) { return vpaddlq_u8(a); }
324[[gnu::always_inline]] nce uint8x16_t equal(uint8x16_t a, uint8x16_t b) { return vceqq_u8(a, b); }
325[[gnu::always_inline]] nce uint8x16_t greater_than_or_equal(uint8x16_t a, uint8x16_t b) { return vcgeq_u8(a, b); }
326[[gnu::always_inline]] nce uint8x16_t less_than_or_equal(uint8x16_t a, uint8x16_t b) { return vcleq_u8(a, b); }
327[[gnu::always_inline]] nce uint8x16_t greater_than(uint8x16_t a, uint8x16_t b) { return vcgtq_u8(a, b); }
328[[gnu::always_inline]] nce uint8x16_t less_than(uint8x16_t a, uint8x16_t b) { return vcltq_u8(a, b); }
329[[gnu::always_inline]] nce uint8x16_t compare_test_nonzero(uint8x16_t a, uint8x16_t b) { return vtstq_u8(a, b); }
330template <int n>[[gnu::always_inline]] nce uint8x16_t shift_left(uint8x16_t a) { return vshlq_n_u8(a, n); }
331template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_accumulate(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, n); }
332template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_accumulate_round(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, n); }
333template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(uint8x16_t a) { return vreinterpretq_p8_u8(a); }
334template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(uint8x16_t a) { return vreinterpretq_p16_u8(a); }
335#ifdef __clang__
336[[gnu::always_inline]] nce int8x16_t count_leading_sign_bits(uint8x16_t a) { return vclsq_u8(a); }
337#endif
338[[gnu::always_inline]] nce uint8x16_t count_active_bits(uint8x16_t a) { return vcntq_u8(a); }
339[[gnu::always_inline]] nce uint8x16_t bitwise_select(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); }
340[[gnu::always_inline]] nce uint8x8_t get_high(uint8x16_t a) { return vget_high_u8(a); }
341[[gnu::always_inline]] nce uint8x8_t get_low(uint8x16_t a) { return vget_low_u8(a); }
342template <int n>[[gnu::always_inline]] nce uint8x16_t extract(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, n); }
343[[gnu::always_inline]] nce uint8x16x2_t zip(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); }
344[[gnu::always_inline]] nce uint8x16x2_t unzip(uint8x16_t a, uint8x16_t b) { return vuzpq_u8(a, b); }
345[[gnu::always_inline]] nce uint8x16x2_t transpose(uint8x16_t a, uint8x16_t b) { return vtrnq_u8(a, b); }
346[[gnu::always_inline]] nce int8x16_t bitwise_select(uint8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); }
347[[gnu::always_inline]] nce poly8x16_t bitwise_select(uint8x16_t a, poly8x16_t b, poly8x16_t c) { return vbslq_p8(a, b, c); }
348[[gnu::always_inline]] nce int8x8_t add(int8x8_t a, int8x8_t b) { return vadd_s8(a, b); }
349[[gnu::always_inline]] nce int16x8_t add_long(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); }
350[[gnu::always_inline]] nce int8x8_t add_halve(int8x8_t a, int8x8_t b) { return vhadd_s8(a, b); }
351[[gnu::always_inline]] nce int8x8_t add_halve_round(int8x8_t a, int8x8_t b) { return vrhadd_s8(a, b); }
352[[gnu::always_inline]] nce int8x8_t add_saturate(int8x8_t a, int8x8_t b) { return vqadd_s8(a, b); }
353[[gnu::always_inline]] nce int8x8_t multiply(int8x8_t a, int8x8_t b) { return vmul_s8(a, b); }
354[[gnu::always_inline]] nce int8x8_t multiply_add(int8x8_t a, int8x8_t b, int8x8_t c) { return vmla_s8(a, b, c); }
355[[gnu::always_inline]] nce int8x8_t multiply_subtract(int8x8_t a, int8x8_t b, int8x8_t c) { return vmls_s8(a, b, c); }
356[[gnu::always_inline]] nce int16x8_t multiply_long(int8x8_t a, int8x8_t b) { return vmull_s8(a, b); }
357[[gnu::always_inline]] nce int8x8_t subtract(int8x8_t a, int8x8_t b) { return vsub_s8(a, b); }
358[[gnu::always_inline]] nce int16x8_t subtract_long(int8x8_t a, int8x8_t b) { return vsubl_s8(a, b); }
359[[gnu::always_inline]] nce int8x8_t subtract_halve(int8x8_t a, int8x8_t b) { return vhsub_s8(a, b); }
360[[gnu::always_inline]] nce int8x8_t subtract_saturate(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); }
361[[gnu::always_inline]] nce int8x8_t subtract_absolute(int8x8_t a, int8x8_t b) { return vabd_s8(a, b); }
362[[gnu::always_inline]] nce int16x8_t subtract_absolute_long(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); }
363[[gnu::always_inline]] nce int8x8_t subtract_absolute_add(int8x8_t a, int8x8_t b, int8x8_t c) { return vaba_s8(a, b, c); }
364[[gnu::always_inline]] nce int8x8_t absolute(int8x8_t a) { return vabs_s8(a); }
365[[gnu::always_inline]] nce int8x8_t absolute_saturate(int8x8_t a) { return vqabs_s8(a); }
366[[gnu::always_inline]] nce int8x8_t max(int8x8_t a, int8x8_t b) { return vmax_s8(a, b); }
367[[gnu::always_inline]] nce int8x8_t min(int8x8_t a, int8x8_t b) { return vmin_s8(a, b); }
368[[gnu::always_inline]] nce int8x8_t pairwise_add(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); }
369[[gnu::always_inline]] nce int16x4_t pairwise_add_long(int8x8_t a) { return vpaddl_s8(a); }
370[[gnu::always_inline]] nce int8x8_t pairwise_max(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); }
371[[gnu::always_inline]] nce int8x8_t pairwise_min(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); }
372[[gnu::always_inline]] nce uint8x8_t equal(int8x8_t a, int8x8_t b) { return vceq_s8(a, b); }
373[[gnu::always_inline]] nce uint8x8_t greater_than_or_equal(int8x8_t a, int8x8_t b) { return vcge_s8(a, b); }
374[[gnu::always_inline]] nce uint8x8_t less_than_or_equal(int8x8_t a, int8x8_t b) { return vcle_s8(a, b); }
375[[gnu::always_inline]] nce uint8x8_t greater_than(int8x8_t a, int8x8_t b) { return vcgt_s8(a, b); }
376[[gnu::always_inline]] nce uint8x8_t less_than(int8x8_t a, int8x8_t b) { return vclt_s8(a, b); }
377[[gnu::always_inline]] nce uint8x8_t compare_test_nonzero(int8x8_t a, int8x8_t b) { return vtst_s8(a, b); }
378[[gnu::always_inline]] nce int8x8_t shift_left(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); }
379template <int n>[[gnu::always_inline]] nce int8x8_t shift_left(int8x8_t a) { return vshl_n_s8(a, n); }
380[[gnu::always_inline]] nce int8x8_t shift_left_saturate(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); }
381template <int n>[[gnu::always_inline]] nce int8x8_t shift_left_saturate(int8x8_t a) { return vqshl_n_s8(a, n); }
382template <int n>[[gnu::always_inline]] nce uint8x8_t shift_left_unsigned_saturate(int8x8_t a) { return vqshlu_n_s8(a, n); }
383[[gnu::always_inline]] nce int8x8_t shift_left_round(int8x8_t a, int8x8_t b) { return vrshl_s8(a, b); }
384[[gnu::always_inline]] nce int8x8_t shift_left_round_saturate(int8x8_t a, int8x8_t b) { return vqrshl_s8(a, b); }
385template <int n>[[gnu::always_inline]] nce int16x8_t shift_left_long(int8x8_t a) { return vshll_n_s8(a, n); }
386template <int n>[[gnu::always_inline]] nce int8x8_t shift_left_insert(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, n); }
387template <int n>[[gnu::always_inline]] nce int8x8_t shift_right(int8x8_t a) { return vshr_n_s8(a, n); }
388template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_round(int8x8_t a) { return vrshr_n_s8(a, n); }
389template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_accumulate(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, n); }
390template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_accumulate_round(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, n); }
391template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_insert(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, n); }
392template <> [[gnu::always_inline]] nce int16x4_t reinterpret(int8x8_t a) { return vreinterpret_s16_s8(a); }
393template <> [[gnu::always_inline]] nce int32x2_t reinterpret(int8x8_t a) { return vreinterpret_s32_s8(a); }
394template <> [[gnu::always_inline]] nce float32x2_t reinterpret(int8x8_t a) { return vreinterpret_f32_s8(a); }
395template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(int8x8_t a) { return vreinterpret_u8_s8(a); }
396template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(int8x8_t a) { return vreinterpret_u16_s8(a); }
397template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(int8x8_t a) { return vreinterpret_u32_s8(a); }
398template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(int8x8_t a) { return vreinterpret_p8_s8(a); }
399template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(int8x8_t a) { return vreinterpret_p16_s8(a); }
400template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(int8x8_t a) { return vreinterpret_u64_s8(a); }
401template <> [[gnu::always_inline]] nce int64x1_t reinterpret(int8x8_t a) { return vreinterpret_s64_s8(a); }
// int8x8_t unary, bitwise and bit-count wrappers. Every function forwards its
// arguments, in declaration order, to the single intrinsic named in its body
// and returns that result unchanged. move_long is the only one here whose
// result type (int16x8_t) differs from its int8x8_t input.
402[[gnu::always_inline]] nce int16x8_t move_long(int8x8_t a) { return vmovl_s8(a); }
403[[gnu::always_inline]] nce int8x8_t negate(int8x8_t a) { return vneg_s8(a); }
404[[gnu::always_inline]] nce int8x8_t negate_saturate(int8x8_t a) { return vqneg_s8(a); }
405[[gnu::always_inline]] nce int8x8_t bitwise_not(int8x8_t a) { return vmvn_s8(a); }
406[[gnu::always_inline]] nce int8x8_t bitwise_and(int8x8_t a, int8x8_t b) { return vand_s8(a, b); }
407[[gnu::always_inline]] nce int8x8_t bitwise_or(int8x8_t a, int8x8_t b) { return vorr_s8(a, b); }
408[[gnu::always_inline]] nce int8x8_t bitwise_xor(int8x8_t a, int8x8_t b) { return veor_s8(a, b); }
409[[gnu::always_inline]] nce int8x8_t bitwise_or_not(int8x8_t a, int8x8_t b) { return vorn_s8(a, b); }
410[[gnu::always_inline]] nce int8x8_t count_leading_sign_bits(int8x8_t a) { return vcls_s8(a); }
411[[gnu::always_inline]] nce int8x8_t count_leading_zero_bits(int8x8_t a) { return vclz_s8(a); }
412[[gnu::always_inline]] nce int8x8_t count_active_bits(int8x8_t a) { return vcnt_s8(a); }
413[[gnu::always_inline]] nce int8x8_t bitwise_clear(int8x8_t a, int8x8_t b) { return vbic_s8(a, b); }
// int8x8_t lane, permutation and table wrappers.
// The `template <int lane>` / `template <int n>` forms pass their template
// argument as the last argument of the intrinsic, so it is always a
// compile-time constant.
414template <int lane>[[gnu::always_inline]] nce int8x8_t duplicate_lane(int8x8_t a) { return vdup_lane_s8(a, lane); }
415template <int lane>[[gnu::always_inline]] nce int8x16_t duplicate_lane_quad(int8x8_t a) { return vdupq_lane_s8(a, lane); }
416[[gnu::always_inline]] nce int8x16_t combine(int8x8_t low, int8x8_t high) { return vcombine_s8(low, high); }
417template <int lane>[[gnu::always_inline]] nce int8_t get_lane(int8x8_t v) { return vget_lane_s8(v, lane); }
418template <int n>[[gnu::always_inline]] nce int8x8_t extract(int8x8_t a, int8x8_t b) { return vext_s8(a, b, n); }
419[[gnu::always_inline]] nce int8x8_t reverse_64bit(int8x8_t a) { return vrev64_s8(a); }
420[[gnu::always_inline]] nce int8x8_t reverse_32bit(int8x8_t a) { return vrev32_s8(a); }
421[[gnu::always_inline]] nce int8x8_t reverse_16bit(int8x8_t a) { return vrev16_s8(a); }
// zip / unzip / transpose return the two-vector aggregate int8x8x2_t produced
// by vzip_s8 / vuzp_s8 / vtrn_s8.
422[[gnu::always_inline]] nce int8x8x2_t zip(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); }
423[[gnu::always_inline]] nce int8x8x2_t unzip(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); }
424[[gnu::always_inline]] nce int8x8x2_t transpose(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); }
// table_lookup1 forwards to vtbl1_s8; table_extensionN forwards to vtbxN_s8,
// where `b` is a 1-, 2-, 3- or 4-vector aggregate matching N.
425[[gnu::always_inline]] nce int8x8_t table_lookup1(int8x8_t a, int8x8_t idx) { return vtbl1_s8(a, idx); }
426[[gnu::always_inline]] nce int8x8_t table_extension1(int8x8_t a, int8x8_t b, int8x8_t idx) { return vtbx1_s8(a, b, idx); }
427[[gnu::always_inline]] nce int8x8_t table_extension2(int8x8_t a, int8x8x2_t b, int8x8_t idx) { return vtbx2_s8(a, b, idx); }
428[[gnu::always_inline]] nce int8x8_t table_extension3(int8x8_t a, int8x8x3_t b, int8x8_t idx) { return vtbx3_s8(a, b, idx); }
429[[gnu::always_inline]] nce int8x8_t table_extension4(int8x8_t a, int8x8x4_t b, int8x8_t idx) { return vtbx4_s8(a, b, idx); }
// int8x16_t (128-bit) arithmetic, comparison and shift wrappers; each forwards
// to the `q`-suffixed intrinsic named in its body.
430[[gnu::always_inline]] nce int8x16_t multiply_add(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlaq_s8(a, b, c); }
431[[gnu::always_inline]] nce int8x16_t multiply_subtract(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlsq_s8(a, b, c); }
432[[gnu::always_inline]] nce int8x16_t subtract_absolute_add(int8x16_t a, int8x16_t b, int8x16_t c) { return vabaq_s8(a, b, c); }
433[[gnu::always_inline]] nce int16x8_t pairwise_add_long(int8x16_t a) { return vpaddlq_s8(a); }
// Comparisons take signed inputs but return the unsigned vector uint8x16_t.
434[[gnu::always_inline]] nce uint8x16_t equal(int8x16_t a, int8x16_t b) { return vceqq_s8(a, b); }
435[[gnu::always_inline]] nce uint8x16_t greater_than_or_equal(int8x16_t a, int8x16_t b) { return vcgeq_s8(a, b); }
436[[gnu::always_inline]] nce uint8x16_t less_than_or_equal(int8x16_t a, int8x16_t b) { return vcleq_s8(a, b); }
437[[gnu::always_inline]] nce uint8x16_t greater_than(int8x16_t a, int8x16_t b) { return vcgtq_s8(a, b); }
438[[gnu::always_inline]] nce uint8x16_t less_than(int8x16_t a, int8x16_t b) { return vcltq_s8(a, b); }
439[[gnu::always_inline]] nce uint8x16_t compare_test_nonzero(int8x16_t a, int8x16_t b) { return vtstq_s8(a, b); }
// Shift amount `n` is a template parameter passed as the intrinsic's last argument.
440template <int n>[[gnu::always_inline]] nce int8x16_t shift_left(int8x16_t a) { return vshlq_n_s8(a, n); }
441template <int n>[[gnu::always_inline]] nce uint8x16_t shift_left_unsigned_saturate(int8x16_t a) { return vqshluq_n_s8(a, n); }
442template <int n>[[gnu::always_inline]] nce int8x16_t shift_right_accumulate(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, n); }
443template <int n>[[gnu::always_inline]] nce int8x16_t shift_right_accumulate_round(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, n); }
// int8x16_t reinterpret, split and permutation wrappers.
// The two reinterpret specializations cover the polynomial destination types
// (poly8x16_t, poly16x8_t) for an int8x16_t argument.
444template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(int8x16_t a) { return vreinterpretq_p8_s8(a); }
445template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(int8x16_t a) { return vreinterpretq_p16_s8(a); }
446[[gnu::always_inline]] nce int8x16_t count_active_bits(int8x16_t a) { return vcntq_s8(a); }
// get_high / get_low return an int8x8_t via vget_high_s8 / vget_low_s8.
447[[gnu::always_inline]] nce int8x8_t get_high(int8x16_t a) { return vget_high_s8(a); }
448[[gnu::always_inline]] nce int8x8_t get_low(int8x16_t a) { return vget_low_s8(a); }
449template <int n>[[gnu::always_inline]] nce int8x16_t extract(int8x16_t a, int8x16_t b) { return vextq_s8(a, b, n); }
450[[gnu::always_inline]] nce int8x16x2_t zip(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); }
451[[gnu::always_inline]] nce int8x16x2_t unzip(int8x16_t a, int8x16_t b) { return vuzpq_s8(a, b); }
452[[gnu::always_inline]] nce int8x16x2_t transpose(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); }
// uint16x4_t arithmetic wrappers. Each forwards its arguments, in order, to
// the `_u16` intrinsic named in its body. Functions named *_long return a
// wider-lane vector (uint32x4_t, or uint32x2_t for pairwise_add_long), as
// shown by their return types.
453[[gnu::always_inline]] nce uint16x4_t add(uint16x4_t a, uint16x4_t b) { return vadd_u16(a, b); }
454[[gnu::always_inline]] nce uint32x4_t add_long(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); }
455[[gnu::always_inline]] nce uint16x4_t add_halve(uint16x4_t a, uint16x4_t b) { return vhadd_u16(a, b); }
456[[gnu::always_inline]] nce uint16x4_t add_halve_round(uint16x4_t a, uint16x4_t b) { return vrhadd_u16(a, b); }
457[[gnu::always_inline]] nce uint16x4_t add_saturate(uint16x4_t a, uint16x4_t b) { return vqadd_u16(a, b); }
458[[gnu::always_inline]] nce uint16x4_t multiply(uint16x4_t a, uint16x4_t b) { return vmul_u16(a, b); }
459[[gnu::always_inline]] nce uint16x4_t multiply_add(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_u16(a, b, c); }
460[[gnu::always_inline]] nce uint16x4_t multiply_subtract(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_u16(a, b, c); }
461[[gnu::always_inline]] nce uint32x4_t multiply_long(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); }
462[[gnu::always_inline]] nce uint16x4_t subtract(uint16x4_t a, uint16x4_t b) { return vsub_u16(a, b); }
463[[gnu::always_inline]] nce uint32x4_t subtract_long(uint16x4_t a, uint16x4_t b) { return vsubl_u16(a, b); }
464[[gnu::always_inline]] nce uint16x4_t subtract_halve(uint16x4_t a, uint16x4_t b) { return vhsub_u16(a, b); }
465[[gnu::always_inline]] nce uint16x4_t subtract_saturate(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); }
466[[gnu::always_inline]] nce uint16x4_t subtract_absolute(uint16x4_t a, uint16x4_t b) { return vabd_u16(a, b); }
467[[gnu::always_inline]] nce uint32x4_t subtract_absolute_long(uint16x4_t a, uint16x4_t b) { return vabdl_u16(a, b); }
468[[gnu::always_inline]] nce uint16x4_t subtract_absolute_add(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vaba_u16(a, b, c); }
469[[gnu::always_inline]] nce uint16x4_t max(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); }
470[[gnu::always_inline]] nce uint16x4_t min(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); }
471[[gnu::always_inline]] nce uint16x4_t pairwise_add(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); }
472[[gnu::always_inline]] nce uint32x2_t pairwise_add_long(uint16x4_t a) { return vpaddl_u16(a); }
// The accumulator `a` is uint16x4_t while the added operand `b` is uint8x8_t,
// hence the `_u8` suffix on the intrinsic.
473[[gnu::always_inline]] nce uint16x4_t pairwise_add_accumulate_long(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); }
474[[gnu::always_inline]] nce uint16x4_t pairwise_max(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); }
475[[gnu::always_inline]] nce uint16x4_t pairwise_min(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); }
// uint16x4_t comparison wrappers. Each forwards (a, b) to the vc*_u16 /
// vtst_u16 intrinsic in its body and returns that intrinsic's uint16x4_t result.
476[[gnu::always_inline]] nce uint16x4_t equal(uint16x4_t a, uint16x4_t b) { return vceq_u16(a, b); }
477[[gnu::always_inline]] nce uint16x4_t greater_than_or_equal(uint16x4_t a, uint16x4_t b) { return vcge_u16(a, b); }
478[[gnu::always_inline]] nce uint16x4_t less_than_or_equal(uint16x4_t a, uint16x4_t b) { return vcle_u16(a, b); }
479[[gnu::always_inline]] nce uint16x4_t greater_than(uint16x4_t a, uint16x4_t b) { return vcgt_u16(a, b); }
480[[gnu::always_inline]] nce uint16x4_t less_than(uint16x4_t a, uint16x4_t b) { return vclt_u16(a, b); }
481[[gnu::always_inline]] nce uint16x4_t compare_test_nonzero(uint16x4_t a, uint16x4_t b) { return vtst_u16(a, b); }
// uint16x4_t shift wrappers. Two calling forms coexist under the same name:
//   - (a, int16x4_t b): the shift operand is a signed vector forwarded as-is;
//   - template <int n>(a[, b]): `n` is a compile-time constant passed as the
//     intrinsic's last argument (the `_n_` intrinsics).
482[[gnu::always_inline]] nce uint16x4_t shift_left(uint16x4_t a, int16x4_t b) { return vshl_u16(a, b); }
483template <int n>[[gnu::always_inline]] nce uint16x4_t shift_left(uint16x4_t a) { return vshl_n_u16(a, n); }
484[[gnu::always_inline]] nce uint16x4_t shift_left_saturate(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); }
485template <int n>[[gnu::always_inline]] nce uint16x4_t shift_left_saturate(uint16x4_t a) { return vqshl_n_u16(a, n); }
486template <int n>[[gnu::always_inline]] nce uint32x4_t shift_left_long(uint16x4_t a) { return vshll_n_u16(a, n); }
487template <int n>[[gnu::always_inline]] nce uint16x4_t shift_left_insert(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, n); }
488[[gnu::always_inline]] nce uint16x4_t shift_left_round(uint16x4_t a, int16x4_t b) { return vrshl_u16(a, b); }
489[[gnu::always_inline]] nce uint16x4_t shift_left_round_saturate(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); }
490template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right(uint16x4_t a) { return vshr_n_u16(a, n); }
491template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_round(uint16x4_t a) { return vrshr_n_u16(a, n); }
492template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_accumulate(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, n); }
493template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_accumulate_round(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, n); }
494template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_insert(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, n); }
// reinterpret<T>(uint16x4_t): explicit specializations of the `reinterpret`
// template declared at the top of this header, one per destination type.
// Each forwards `a` unchanged to the vreinterpret_<dst>_u16 intrinsic in its
// body. Callers name T explicitly because it appears only in the return type.
495template <> [[gnu::always_inline]] nce int8x8_t reinterpret(uint16x4_t a) { return vreinterpret_s8_u16(a); }
496template <> [[gnu::always_inline]] nce int16x4_t reinterpret(uint16x4_t a) { return vreinterpret_s16_u16(a); }
497template <> [[gnu::always_inline]] nce int32x2_t reinterpret(uint16x4_t a) { return vreinterpret_s32_u16(a); }
498template <> [[gnu::always_inline]] nce float32x2_t reinterpret(uint16x4_t a) { return vreinterpret_f32_u16(a); }
499template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(uint16x4_t a) { return vreinterpret_u8_u16(a); }
500template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(uint16x4_t a) { return vreinterpret_u32_u16(a); }
501template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(uint16x4_t a) { return vreinterpret_p8_u16(a); }
502template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(uint16x4_t a) { return vreinterpret_p16_u16(a); }
503template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(uint16x4_t a) { return vreinterpret_u64_u16(a); }
504template <> [[gnu::always_inline]] nce int64x1_t reinterpret(uint16x4_t a) { return vreinterpret_s64_u16(a); }
// uint16x4_t widening move plus by-lane and by-scalar multiply wrappers.
// *_lane forms forward the vector `v` together with the compile-time `lane`
// index to the `_lane_` intrinsic; the overloads taking a uint16_t `c`
// forward to the `_n_` intrinsic instead.
505[[gnu::always_inline]] nce uint32x4_t move_long(uint16x4_t a) { return vmovl_u16(a); }
506template <int lane> [[gnu::always_inline]] nce uint16x4_t multiply_add_lane(uint16x4_t a, uint16x4_t b, uint16x4_t v) { return vmla_lane_u16(a, b, v, lane); }
507template <int lane> [[gnu::always_inline]] nce uint16x4_t multiply_subtract_lane(uint16x4_t a, uint16x4_t b, uint16x4_t v) { return vmls_lane_u16(a, b, v, lane); }
508[[gnu::always_inline]] nce uint16x4_t multiply_add(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmla_n_u16(a, b, c); }
509template <int lane> [[gnu::always_inline]] nce uint16x4_t multiply_lane(uint16x4_t a, uint16x4_t v) { return vmul_lane_u16(a, v, lane); }
510template <int lane> [[gnu::always_inline]] nce uint32x4_t multiply_long_lane(uint16x4_t a, uint16x4_t v) { return vmull_lane_u16(a, v, lane); }
511[[gnu::always_inline]] nce uint16x4_t multiply_subtract(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmls_n_u16(a, b, c); }
// uint16x4_t bitwise, bit-count and select wrappers; each forwards to the
// `_u16` intrinsic named in its body.
512[[gnu::always_inline]] nce uint16x4_t bitwise_not(uint16x4_t a) { return vmvn_u16(a); }
513[[gnu::always_inline]] nce uint16x4_t bitwise_and(uint16x4_t a, uint16x4_t b) { return vand_u16(a, b); }
514[[gnu::always_inline]] nce uint16x4_t bitwise_or(uint16x4_t a, uint16x4_t b) { return vorr_u16(a, b); }
515[[gnu::always_inline]] nce uint16x4_t bitwise_xor(uint16x4_t a, uint16x4_t b) { return veor_u16(a, b); }
516[[gnu::always_inline]] nce uint16x4_t bitwise_or_not(uint16x4_t a, uint16x4_t b) { return vorn_u16(a, b); }
// Only declared when __clang__ is defined; note the signed int16x4_t result
// for an unsigned input.
// NOTE(review): presumably guarded because vcls_u16 is not provided by other
// compilers' arm_neon.h - confirm.
517#ifdef __clang__
518[[gnu::always_inline]] nce int16x4_t count_leading_sign_bits(uint16x4_t a) { return vcls_u16(a); }
519#endif
520[[gnu::always_inline]] nce uint16x4_t count_leading_zero_bits(uint16x4_t a) { return vclz_u16(a); }
521[[gnu::always_inline]] nce uint16x4_t bitwise_clear(uint16x4_t a, uint16x4_t b) { return vbic_u16(a, b); }
522[[gnu::always_inline]] nce uint16x4_t bitwise_select(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vbsl_u16(a, b, c); }
// uint16x4_t lane, permutation, select and by-scalar multiply wrappers.
// Template parameters (`lane`, `n`) are passed as the intrinsic's last argument.
523template <int lane>[[gnu::always_inline]] nce uint16x4_t duplicate_lane(uint16x4_t a) { return vdup_lane_u16(a, lane); }
524template <int lane>[[gnu::always_inline]] nce uint16x8_t duplicate_lane_quad(uint16x4_t a) { return vdupq_lane_u16(a, lane); }
525[[gnu::always_inline]] nce uint16x8_t combine(uint16x4_t low, uint16x4_t high) { return vcombine_u16(low, high); }
526template <int lane>[[gnu::always_inline]] nce uint16_t get_lane(uint16x4_t v) { return vget_lane_u16(v, lane); }
527template <int n>[[gnu::always_inline]] nce uint16x4_t extract(uint16x4_t a, uint16x4_t b) { return vext_u16(a, b, n); }
528[[gnu::always_inline]] nce uint16x4_t reverse_64bit(uint16x4_t a) { return vrev64_u16(a); }
529[[gnu::always_inline]] nce uint16x4_t reverse_32bit(uint16x4_t a) { return vrev32_u16(a); }
530[[gnu::always_inline]] nce uint16x4x2_t zip(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); }
531[[gnu::always_inline]] nce uint16x4x2_t unzip(uint16x4_t a, uint16x4_t b) { return vuzp_u16(a, b); }
532[[gnu::always_inline]] nce uint16x4x2_t transpose(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); }
// bitwise_select overloads whose first argument `a` is uint16x4_t while the
// other two operands (and the result) are int16x4_t or poly16x4_t.
533[[gnu::always_inline]] nce int16x4_t bitwise_select(uint16x4_t a, int16x4_t b, int16x4_t c) { return vbsl_s16(a, b, c); }
534[[gnu::always_inline]] nce poly16x4_t bitwise_select(uint16x4_t a, poly16x4_t b, poly16x4_t c) { return vbsl_p16(a, b, c); }
// Vector-by-scalar multiplies: `b` is a single uint16_t forwarded to the `_n_` intrinsic.
535[[gnu::always_inline]] nce uint16x4_t multiply(uint16x4_t a, uint16_t b) { return vmul_n_u16(a, b); }
536[[gnu::always_inline]] nce uint32x4_t multiply_long(uint16x4_t a, uint16_t b) { return vmull_n_u16(a, b); }
// uint16x8_t arithmetic wrappers.
// The first five take a uint16x8_t `a` with uint8x8_t operands and forward to
// the `_u8` widening intrinsics (vaddw, vmlal, vmlsl, vsubw, vabal).
537[[gnu::always_inline]] nce uint16x8_t add(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); }
538[[gnu::always_inline]] nce uint16x8_t multiply_add_long(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); }
539[[gnu::always_inline]] nce uint16x8_t multiply_subtract_long(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); }
540[[gnu::always_inline]] nce uint16x8_t subtract(uint16x8_t a, uint8x8_t b) { return vsubw_u8(a, b); }
541[[gnu::always_inline]] nce uint16x8_t subtract_absolute_add(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); }
// *_narrow functions take two uint16x8_t operands and return uint8x8_t.
542[[gnu::always_inline]] nce uint8x8_t add_narrow(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); }
543[[gnu::always_inline]] nce uint8x8_t add_round_narrow(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); }
544[[gnu::always_inline]] nce uint16x8_t multiply_add(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlaq_u16(a, b, c); }
545[[gnu::always_inline]] nce uint16x8_t multiply_subtract(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlsq_u16(a, b, c); }
546[[gnu::always_inline]] nce uint8x8_t subtract_narrow(uint16x8_t a, uint16x8_t b) { return vsubhn_u16(a, b); }
547[[gnu::always_inline]] nce uint8x8_t subtract_round_narrow(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); }
548[[gnu::always_inline]] nce uint16x8_t subtract_absolute_add(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vabaq_u16(a, b, c); }
549[[gnu::always_inline]] nce uint32x4_t pairwise_add_long(uint16x8_t a) { return vpaddlq_u16(a); }
550[[gnu::always_inline]] nce uint16x8_t pairwise_add_accumulate_long(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); }
// uint16x8_t comparison wrappers: forward (a, b) to the vc*q_u16 / vtstq_u16
// intrinsic and return its uint16x8_t result.
551[[gnu::always_inline]] nce uint16x8_t equal(uint16x8_t a, uint16x8_t b) { return vceqq_u16(a, b); }
552[[gnu::always_inline]] nce uint16x8_t greater_than_or_equal(uint16x8_t a, uint16x8_t b) { return vcgeq_u16(a, b); }
553[[gnu::always_inline]] nce uint16x8_t less_than_or_equal(uint16x8_t a, uint16x8_t b) { return vcleq_u16(a, b); }
554[[gnu::always_inline]] nce uint16x8_t greater_than(uint16x8_t a, uint16x8_t b) { return vcgtq_u16(a, b); }
555[[gnu::always_inline]] nce uint16x8_t less_than(uint16x8_t a, uint16x8_t b) { return vcltq_u16(a, b); }
556[[gnu::always_inline]] nce uint16x8_t compare_test_nonzero(uint16x8_t a, uint16x8_t b) { return vtstq_u16(a, b); }
// uint16x8_t right shifts by a compile-time amount `n`. The *_narrow variants
// return uint8x8_t rather than uint16x8_t.
557template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_accumulate(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, n); }
558template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_accumulate_round(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, n); }
559template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_narrow(uint16x8_t a) { return vshrn_n_u16(a, n); }
560template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_saturate_narrow(uint16x8_t a) { return vqshrn_n_u16(a, n); }
561template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_round_saturate_narrow(uint16x8_t a) { return vqrshrn_n_u16(a, n); }
562template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_round_narrow(uint16x8_t a) { return vrshrn_n_u16(a, n); }
// uint16x8_t reinterpret, narrowing move and by-lane / by-scalar multiply wrappers.
// The two reinterpret specializations cover the polynomial destination types
// (poly8x16_t, poly16x8_t) for a uint16x8_t argument.
563template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(uint16x8_t a) { return vreinterpretq_p8_u16(a); }
564template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(uint16x8_t a) { return vreinterpretq_p16_u16(a); }
565[[gnu::always_inline]] nce uint8x8_t move_narrow(uint16x8_t a) { return vmovn_u16(a); }
566[[gnu::always_inline]] nce uint8x8_t move_saturate_narrow(uint16x8_t a) { return vqmovn_u16(a); }
// *_lane forms operate on uint16x8_t but take the lane source `v` as a uint16x4_t.
567template <int lane> [[gnu::always_inline]] nce uint16x8_t multiply_add_lane(uint16x8_t a, uint16x8_t b, uint16x4_t v) { return vmlaq_lane_u16(a, b, v, lane); }
568template <int n>[[gnu::always_inline]] nce uint16x8_t shift_left(uint16x8_t a) { return vshlq_n_u16(a, n); }
569template <int lane> [[gnu::always_inline]] nce uint16x8_t multiply_lane(uint16x8_t a, uint16x4_t v) { return vmulq_lane_u16(a, v, lane); }
570template <int lane> [[gnu::always_inline]] nce uint16x8_t multiply_subtract_lane(uint16x8_t a, uint16x8_t b, uint16x4_t v) { return vmlsq_lane_u16(a, b, v, lane); }
571[[gnu::always_inline]] nce uint16x8_t multiply_add(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); }
572[[gnu::always_inline]] nce uint16x8_t multiply_subtract(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlsq_n_u16(a, b, c); }
// uint16x8_t bit-count, select, split and permutation wrappers.
// count_leading_sign_bits is only declared when __clang__ is defined and
// returns the signed int16x8_t.
// NOTE(review): presumably guarded because vclsq_u16 is not provided by other
// compilers' arm_neon.h - confirm.
573#ifdef __clang__
574[[gnu::always_inline]] nce int16x8_t count_leading_sign_bits(uint16x8_t a) { return vclsq_u16(a); }
575#endif
576[[gnu::always_inline]] nce uint16x8_t bitwise_select(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); }
577[[gnu::always_inline]] nce uint16x4_t get_high(uint16x8_t a) { return vget_high_u16(a); }
578[[gnu::always_inline]] nce uint16x4_t get_low(uint16x8_t a) { return vget_low_u16(a); }
579template <int n>[[gnu::always_inline]] nce uint16x8_t extract(uint16x8_t a, uint16x8_t b) { return vextq_u16(a, b, n); }
580[[gnu::always_inline]] nce uint16x8x2_t zip(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); }
581[[gnu::always_inline]] nce uint16x8x2_t unzip(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); }
582[[gnu::always_inline]] nce uint16x8x2_t transpose(uint16x8_t a, uint16x8_t b) { return vtrnq_u16(a, b); }
// bitwise_select overloads whose first argument `a` is uint16x8_t while the
// other two operands (and the result) are int16x8_t or poly16x8_t.
583[[gnu::always_inline]] nce int16x8_t bitwise_select(uint16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); }
584[[gnu::always_inline]] nce poly16x8_t bitwise_select(uint16x8_t a, poly16x8_t b, poly16x8_t c) { return vbslq_p16(a, b, c); }
// int16x4_t arithmetic wrappers. Each forwards its arguments, in order, to the
// intrinsic named in its body. Functions named *_long return int32x4_t.
// pairwise_add_accumulate_long takes an int16x4_t accumulator with an
// int8x8_t operand, hence the `_s8` intrinsic suffix.
585[[gnu::always_inline]] nce int16x4_t pairwise_add_accumulate_long(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); }
586[[gnu::always_inline]] nce int16x4_t add(int16x4_t a, int16x4_t b) { return vadd_s16(a, b); }
587[[gnu::always_inline]] nce int32x4_t add_long(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); }
588[[gnu::always_inline]] nce int16x4_t add_halve(int16x4_t a, int16x4_t b) { return vhadd_s16(a, b); }
589[[gnu::always_inline]] nce int16x4_t add_halve_round(int16x4_t a, int16x4_t b) { return vrhadd_s16(a, b); }
590[[gnu::always_inline]] nce int16x4_t add_saturate(int16x4_t a, int16x4_t b) { return vqadd_s16(a, b); }
591[[gnu::always_inline]] nce int16x4_t multiply(int16x4_t a, int16x4_t b) { return vmul_s16(a, b); }
592[[gnu::always_inline]] nce int16x4_t multiply_add(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_s16(a, b, c); }
593[[gnu::always_inline]] nce int16x4_t multiply_subtract(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_s16(a, b, c); }
594[[gnu::always_inline]] nce int16x4_t multiply_double_saturate_high(int16x4_t a, int16x4_t b) { return vqdmulh_s16(a, b); }
595[[gnu::always_inline]] nce int16x4_t multiply_double_round_saturate_high(int16x4_t a, int16x4_t b) { return vqrdmulh_s16(a, b); }
596[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long(int16x4_t a, int16x4_t b) { return vqdmull_s16(a, b); }
597[[gnu::always_inline]] nce int32x4_t multiply_long(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); }
// By-lane variants: `lane` is a compile-time index passed as the last argument
// of the `_lane_` intrinsic, alongside the lane source vector `v`.
598template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long_lane(int16x4_t a, int16x4_t v) { return vqdmull_lane_s16(a, v, lane); }
599template <int lane> [[gnu::always_inline]] nce int16x4_t multiply_double_saturate_high_lane(int16x4_t a, int16x4_t v) { return vqdmulh_lane_s16(a, v, lane); }
600template <int lane> [[gnu::always_inline]] nce int16x4_t multiply_double_round_saturate_high_lane(int16x4_t a, int16x4_t v) { return vqrdmulh_lane_s16(a, v, lane); }
601[[gnu::always_inline]] nce int16x4_t subtract(int16x4_t a, int16x4_t b) { return vsub_s16(a, b); }
602[[gnu::always_inline]] nce int32x4_t subtract_long(int16x4_t a, int16x4_t b) { return vsubl_s16(a, b); }
603[[gnu::always_inline]] nce int16x4_t subtract_halve(int16x4_t a, int16x4_t b) { return vhsub_s16(a, b); }
604[[gnu::always_inline]] nce int16x4_t subtract_saturate(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); }
605[[gnu::always_inline]] nce int16x4_t subtract_absolute(int16x4_t a, int16x4_t b) { return vabd_s16(a, b); }
606[[gnu::always_inline]] nce int32x4_t subtract_absolute_long(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); }
607[[gnu::always_inline]] nce int16x4_t subtract_absolute_add(int16x4_t a, int16x4_t b, int16x4_t c) { return vaba_s16(a, b, c); }
// int16x4_t by-scalar saturating multiplies (second operand is a single
// int16_t forwarded to the `_n_` intrinsic), followed by absolute-value,
// min/max and pairwise wrappers.
608[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long(int16x4_t a, int16_t b) { return vqdmull_n_s16(a, b); }
609[[gnu::always_inline]] nce int16x4_t multiply_double_saturate_high(int16x4_t a, int16_t b) { return vqdmulh_n_s16(a, b); }
610[[gnu::always_inline]] nce int16x4_t multiply_double_round_saturate_high(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); }
611[[gnu::always_inline]] nce int16x4_t absolute(int16x4_t a) { return vabs_s16(a); }
612[[gnu::always_inline]] nce int16x4_t absolute_saturate(int16x4_t a) { return vqabs_s16(a); }
613[[gnu::always_inline]] nce int16x4_t max(int16x4_t a, int16x4_t b) { return vmax_s16(a, b); }
614[[gnu::always_inline]] nce int16x4_t min(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); }
615[[gnu::always_inline]] nce int16x4_t pairwise_add(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); }
616[[gnu::always_inline]] nce int32x2_t pairwise_add_long(int16x4_t a) { return vpaddl_s16(a); }
617[[gnu::always_inline]] nce int16x4_t pairwise_max(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); }
618[[gnu::always_inline]] nce int16x4_t pairwise_min(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); }
// int16x4_t comparison wrappers. Inputs are signed, but each returns the
// unsigned uint16x4_t produced by the vc*_s16 / vtst_s16 intrinsic.
619[[gnu::always_inline]] nce uint16x4_t equal(int16x4_t a, int16x4_t b) { return vceq_s16(a, b); }
620[[gnu::always_inline]] nce uint16x4_t greater_than_or_equal(int16x4_t a, int16x4_t b) { return vcge_s16(a, b); }
621[[gnu::always_inline]] nce uint16x4_t less_than_or_equal(int16x4_t a, int16x4_t b) { return vcle_s16(a, b); }
622[[gnu::always_inline]] nce uint16x4_t greater_than(int16x4_t a, int16x4_t b) { return vcgt_s16(a, b); }
623[[gnu::always_inline]] nce uint16x4_t less_than(int16x4_t a, int16x4_t b) { return vclt_s16(a, b); }
624[[gnu::always_inline]] nce uint16x4_t compare_test_nonzero(int16x4_t a, int16x4_t b) { return vtst_s16(a, b); }
// int16x4_t shift wrappers. Two calling forms coexist under the same name:
//   - (a, int16x4_t b): the shift operand is a vector forwarded as-is;
//   - template <int n>(a[, b]): `n` is a compile-time constant passed as the
//     intrinsic's last argument (the `_n_` intrinsics).
// shift_left_unsigned_saturate returns uint16x4_t and shift_left_long returns
// int32x4_t; the others return int16x4_t.
625[[gnu::always_inline]] nce int16x4_t shift_left(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); }
626template <int n>[[gnu::always_inline]] nce int16x4_t shift_left(int16x4_t a) { return vshl_n_s16(a, n); }
627[[gnu::always_inline]] nce int16x4_t shift_left_saturate(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); }
628template <int n>[[gnu::always_inline]] nce int16x4_t shift_left_saturate(int16x4_t a) { return vqshl_n_s16(a, n); }
629template <int n>[[gnu::always_inline]] nce uint16x4_t shift_left_unsigned_saturate(int16x4_t a) { return vqshlu_n_s16(a, n); }
630[[gnu::always_inline]] nce int16x4_t shift_left_round(int16x4_t a, int16x4_t b) { return vrshl_s16(a, b); }
631[[gnu::always_inline]] nce int16x4_t shift_left_round_saturate(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); }
632template <int n>[[gnu::always_inline]] nce int32x4_t shift_left_long(int16x4_t a) { return vshll_n_s16(a, n); }
633template <int n>[[gnu::always_inline]] nce int16x4_t shift_left_insert(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, n); }
634template <int n>[[gnu::always_inline]] nce int16x4_t shift_right(int16x4_t a) { return vshr_n_s16(a, n); }
635template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_round(int16x4_t a) { return vrshr_n_s16(a, n); }
636template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_accumulate(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, n); }
637template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_accumulate_round(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, n); }
638template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_insert(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, n); }
// reinterpret<T>(int16x4_t): explicit specializations of the `reinterpret`
// template declared at the top of this header, one per destination type.
// Each forwards `a` unchanged to the vreinterpret_<dst>_s16 intrinsic in its
// body. Callers name T explicitly because it appears only in the return type.
639template <> [[gnu::always_inline]] nce int8x8_t reinterpret(int16x4_t a) { return vreinterpret_s8_s16(a); }
640template <> [[gnu::always_inline]] nce int32x2_t reinterpret(int16x4_t a) { return vreinterpret_s32_s16(a); }
641template <> [[gnu::always_inline]] nce float32x2_t reinterpret(int16x4_t a) { return vreinterpret_f32_s16(a); }
642template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(int16x4_t a) { return vreinterpret_u8_s16(a); }
643template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(int16x4_t a) { return vreinterpret_u16_s16(a); }
644template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(int16x4_t a) { return vreinterpret_u32_s16(a); }
645template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(int16x4_t a) { return vreinterpret_p8_s16(a); }
646template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(int16x4_t a) { return vreinterpret_p16_s16(a); }
647template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(int16x4_t a) { return vreinterpret_u64_s16(a); }
648template <> [[gnu::always_inline]] nce int64x1_t reinterpret(int16x4_t a) { return vreinterpret_s64_s16(a); }
// int16x4_t widening move plus by-lane and by-scalar multiply wrappers.
// *_lane forms forward the vector `v` together with the compile-time `lane`
// index to the `_lane_` intrinsic; the overloads taking an int16_t `c`
// forward to the `_n_` intrinsic instead.
649[[gnu::always_inline]] nce int32x4_t move_long(int16x4_t a) { return vmovl_s16(a); }
650template <int lane> [[gnu::always_inline]] nce int16x4_t multiply_add_lane(int16x4_t a, int16x4_t b, int16x4_t v) { return vmla_lane_s16(a, b, v, lane); }
651template <int lane> [[gnu::always_inline]] nce int16x4_t multiply_subtract_lane(int16x4_t a, int16x4_t b, int16x4_t v) { return vmls_lane_s16(a, b, v, lane); }
652[[gnu::always_inline]] nce int16x4_t multiply_add(int16x4_t a, int16x4_t b, int16_t c) { return vmla_n_s16(a, b, c); }
653template <int lane> [[gnu::always_inline]] nce int16x4_t multiply_lane(int16x4_t a, int16x4_t v) { return vmul_lane_s16(a, v, lane); }
654template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_long_lane(int16x4_t a, int16x4_t v) { return vmull_lane_s16(a, v, lane); }
655[[gnu::always_inline]] nce int16x4_t multiply_subtract(int16x4_t a, int16x4_t b, int16_t c) { return vmls_n_s16(a, b, c); }
// int16x4_t negate, bitwise and bit-count wrappers. Every function takes and
// returns int16x4_t and forwards its arguments, in order, to the `_s16`
// intrinsic named in its body.
656[[gnu::always_inline]] nce int16x4_t negate(int16x4_t a) { return vneg_s16(a); }
657[[gnu::always_inline]] nce int16x4_t negate_saturate(int16x4_t a) { return vqneg_s16(a); }
658[[gnu::always_inline]] nce int16x4_t bitwise_not(int16x4_t a) { return vmvn_s16(a); }
659[[gnu::always_inline]] nce int16x4_t bitwise_and(int16x4_t a, int16x4_t b) { return vand_s16(a, b); }
660[[gnu::always_inline]] nce int16x4_t bitwise_or(int16x4_t a, int16x4_t b) { return vorr_s16(a, b); }
661[[gnu::always_inline]] nce int16x4_t bitwise_xor(int16x4_t a, int16x4_t b) { return veor_s16(a, b); }
662[[gnu::always_inline]] nce int16x4_t bitwise_or_not(int16x4_t a, int16x4_t b) { return vorn_s16(a, b); }
663[[gnu::always_inline]] nce int16x4_t count_leading_sign_bits(int16x4_t a) { return vcls_s16(a); }
664[[gnu::always_inline]] nce int16x4_t count_leading_zero_bits(int16x4_t a) { return vclz_s16(a); }
665[[gnu::always_inline]] nce int16x4_t bitwise_clear(int16x4_t a, int16x4_t b) { return vbic_s16(a, b); }
// int16x4_t lane, permutation and by-scalar multiply wrappers.
// Template parameters (`lane`, `n`) are passed as the intrinsic's last argument.
666template <int lane>[[gnu::always_inline]] nce int16x4_t duplicate_lane(int16x4_t a) { return vdup_lane_s16(a, lane); }
667template <int lane>[[gnu::always_inline]] nce int16x8_t duplicate_lane_quad(int16x4_t a) { return vdupq_lane_s16(a, lane); }
668[[gnu::always_inline]] nce int16x8_t combine(int16x4_t low, int16x4_t high) { return vcombine_s16(low, high); }
669template <int lane>[[gnu::always_inline]] nce int16_t get_lane(int16x4_t v) { return vget_lane_s16(v, lane); }
670template <int n>[[gnu::always_inline]] nce int16x4_t extract(int16x4_t a, int16x4_t b) { return vext_s16(a, b, n); }
671[[gnu::always_inline]] nce int16x4_t reverse_64bit(int16x4_t a) { return vrev64_s16(a); }
672[[gnu::always_inline]] nce int16x4_t reverse_32bit(int16x4_t a) { return vrev32_s16(a); }
// zip / unzip / transpose return the two-vector aggregate int16x4x2_t.
673[[gnu::always_inline]] nce int16x4x2_t zip(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); }
674[[gnu::always_inline]] nce int16x4x2_t unzip(int16x4_t a, int16x4_t b) { return vuzp_s16(a, b); }
675[[gnu::always_inline]] nce int16x4x2_t transpose(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); }
// Vector-by-scalar multiplies: `b` is a single int16_t forwarded to the `_n_` intrinsic.
676[[gnu::always_inline]] nce int16x4_t multiply(int16x4_t a, int16_t b) { return vmul_n_s16(a, b); }
677[[gnu::always_inline]] nce int32x4_t multiply_long(int16x4_t a, int16_t b) { return vmull_n_s16(a, b); }
// int16x8_t arithmetic wrappers.
// The first five take an int16x8_t `a` with int8x8_t operands and forward to
// the `_s8` widening intrinsics (vaddw, vmlal, vmlsl, vsubw, vabal).
678[[gnu::always_inline]] nce int16x8_t add(int16x8_t a, int8x8_t b) { return vaddw_s8(a, b); }
679[[gnu::always_inline]] nce int16x8_t multiply_add_long(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); }
680[[gnu::always_inline]] nce int16x8_t multiply_subtract_long(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); }
681[[gnu::always_inline]] nce int16x8_t subtract(int16x8_t a, int8x8_t b) { return vsubw_s8(a, b); }
682[[gnu::always_inline]] nce int16x8_t subtract_absolute_add(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); }
// By-lane saturating multiplies: operate on int16x8_t, with the lane source
// `v` given as an int16x4_t and `lane` passed as the last intrinsic argument.
683template <int lane> [[gnu::always_inline]] nce int16x8_t multiply_double_saturate_high_lane(int16x8_t a, int16x4_t v) { return vqdmulhq_lane_s16(a, v, lane); }
684template <int lane> [[gnu::always_inline]] nce int16x8_t multiply_double_round_saturate_high_lane(int16x8_t a, int16x4_t v) { return vqrdmulhq_lane_s16(a, v, lane); }
// *_narrow functions take two int16x8_t operands and return int8x8_t.
685[[gnu::always_inline]] nce int8x8_t add_narrow(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); }
686[[gnu::always_inline]] nce int8x8_t add_round_narrow(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); }
687[[gnu::always_inline]] nce int16x8_t multiply_add(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlaq_s16(a, b, c); }
688[[gnu::always_inline]] nce int16x8_t multiply_subtract(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlsq_s16(a, b, c); }
689[[gnu::always_inline]] nce int8x8_t subtract_narrow(int16x8_t a, int16x8_t b) { return vsubhn_s16(a, b); }
690[[gnu::always_inline]] nce int8x8_t subtract_round_narrow(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); }
691[[gnu::always_inline]] nce int16x8_t subtract_absolute_add(int16x8_t a, int16x8_t b, int16x8_t c) { return vabaq_s16(a, b, c); }
692[[gnu::always_inline]] nce int32x4_t pairwise_add_long(int16x8_t a) { return vpaddlq_s16(a); }
693[[gnu::always_inline]] nce int16x8_t pairwise_add_accumulate_long(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); }
694[[gnu::always_inline]] nce uint16x8_t equal(int16x8_t a, int16x8_t b) { return vceqq_s16(a, b); }
695[[gnu::always_inline]] nce uint16x8_t greater_than_or_equal(int16x8_t a, int16x8_t b) { return vcgeq_s16(a, b); }
696[[gnu::always_inline]] nce uint16x8_t less_than_or_equal(int16x8_t a, int16x8_t b) { return vcleq_s16(a, b); }
697[[gnu::always_inline]] nce uint16x8_t greater_than(int16x8_t a, int16x8_t b) { return vcgtq_s16(a, b); }
698[[gnu::always_inline]] nce uint16x8_t less_than(int16x8_t a, int16x8_t b) { return vcltq_s16(a, b); }
699[[gnu::always_inline]] nce uint16x8_t compare_test_nonzero(int16x8_t a, int16x8_t b) { return vtstq_s16(a, b); }
700template <int n>[[gnu::always_inline]] nce int16x8_t shift_left(int16x8_t a) { return vshlq_n_s16(a, n); }
701template <int n>[[gnu::always_inline]] nce uint16x8_t shift_left_unsigned_saturate(int16x8_t a) { return vqshluq_n_s16(a, n); }
702template <int n>[[gnu::always_inline]] nce int16x8_t shift_right_accumulate(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, n); }
703template <int n>[[gnu::always_inline]] nce int16x8_t shift_right_accumulate_round(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, n); }
704template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_narrow(int16x8_t a) { return vshrn_n_s16(a, n); }
705template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_saturate_narrow_unsigned(int16x8_t a) { return vqshrun_n_s16(a, n); }
706template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_saturate_narrow(int16x8_t a) { return vqshrn_n_s16(a, n); }
707template <int n>[[gnu::always_inline]] nce uint8x8_t shift_right_round_saturate_narrow_unsigned(int16x8_t a) { return vqrshrun_n_s16(a, n); }
708template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_round_saturate_narrow(int16x8_t a) { return vqrshrn_n_s16(a, n); }
709template <int n>[[gnu::always_inline]] nce int8x8_t shift_right_round_narrow(int16x8_t a) { return vrshrn_n_s16(a, n); }
710template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(int16x8_t a) { return vreinterpretq_p8_s16(a); }
711template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(int16x8_t a) { return vreinterpretq_p16_s16(a); }
712[[gnu::always_inline]] nce int8x8_t move_narrow(int16x8_t a) { return vmovn_s16(a); }
713[[gnu::always_inline]] nce int8x8_t move_saturate_narrow(int16x8_t a) { return vqmovn_s16(a); }
714[[gnu::always_inline]] nce uint8x8_t move_unsigned_saturate_narrow(int16x8_t a) { return vqmovun_s16(a); }
715template <int lane> [[gnu::always_inline]] nce int16x8_t multiply_lane(int16x8_t a, int16x4_t v) { return vmulq_lane_s16(a, v, lane); }
716template <int lane> [[gnu::always_inline]] nce int16x8_t multiply_add_lane(int16x8_t a, int16x8_t b, int16x4_t v) { return vmlaq_lane_s16(a, b, v, lane); }
717template <int lane> [[gnu::always_inline]] nce int16x8_t multiply_subtract_lane(int16x8_t a, int16x8_t b, int16x4_t v) { return vmlsq_lane_s16(a, b, v, lane); }
718[[gnu::always_inline]] nce int16x8_t multiply_add(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); }
719[[gnu::always_inline]] nce int16x8_t multiply_subtract(int16x8_t a, int16x8_t b, int16_t c) { return vmlsq_n_s16(a, b, c); }
720[[gnu::always_inline]] nce int16x4_t get_high(int16x8_t a) { return vget_high_s16(a); }
721[[gnu::always_inline]] nce int16x4_t get_low(int16x8_t a) { return vget_low_s16(a); }
722template <int n>[[gnu::always_inline]] nce int16x8_t extract(int16x8_t a, int16x8_t b) { return vextq_s16(a, b, n); }
723[[gnu::always_inline]] nce int16x8x2_t zip(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); }
724[[gnu::always_inline]] nce int16x8x2_t unzip(int16x8_t a, int16x8_t b) { return vuzpq_s16(a, b); }
725[[gnu::always_inline]] nce int16x8x2_t transpose(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); }
726[[gnu::always_inline]] nce int32x2_t add(int32x2_t a, int32x2_t b) { return vadd_s32(a, b); }
727[[gnu::always_inline]] nce int64x2_t add_long(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); }
728[[gnu::always_inline]] nce int32x2_t add_halve(int32x2_t a, int32x2_t b) { return vhadd_s32(a, b); }
729[[gnu::always_inline]] nce int32x2_t add_halve_round(int32x2_t a, int32x2_t b) { return vrhadd_s32(a, b); }
730[[gnu::always_inline]] nce int32x2_t add_saturate(int32x2_t a, int32x2_t b) { return vqadd_s32(a, b); }
731[[gnu::always_inline]] nce int32x2_t multiply(int32x2_t a, int32x2_t b) { return vmul_s32(a, b); }
732[[gnu::always_inline]] nce int32x2_t multiply_add(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_s32(a, b, c); }
733[[gnu::always_inline]] nce int32x2_t multiply_subtract(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_s32(a, b, c); }
734[[gnu::always_inline]] nce int32x2_t multiply_double_saturate_high(int32x2_t a, int32x2_t b) { return vqdmulh_s32(a, b); }
735[[gnu::always_inline]] nce int32x2_t multiply_double_round_saturate_high(int32x2_t a, int32x2_t b) { return vqrdmulh_s32(a, b); }
736[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long(int32x2_t a, int32x2_t b) { return vqdmull_s32(a, b); }
737[[gnu::always_inline]] nce int64x2_t multiply_long(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); }
738template <int lane> [[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long_lane(int32x2_t a, int32x2_t v) { return vqdmull_lane_s32(a, v, lane); }
739template <int lane> [[gnu::always_inline]] nce int32x2_t multiply_double_saturate_high_lane(int32x2_t a, int32x2_t v) { return vqdmulh_lane_s32(a, v, lane); }
740template <int lane> [[gnu::always_inline]] nce int32x2_t multiply_double_round_saturate_high_lane(int32x2_t a, int32x2_t v) { return vqrdmulh_lane_s32(a, v, lane); }
741[[gnu::always_inline]] nce int32x2_t subtract(int32x2_t a, int32x2_t b) { return vsub_s32(a, b); }
742[[gnu::always_inline]] nce int64x2_t subtract_long(int32x2_t a, int32x2_t b) { return vsubl_s32(a, b); }
743[[gnu::always_inline]] nce int32x2_t subtract_halve(int32x2_t a, int32x2_t b) { return vhsub_s32(a, b); }
744[[gnu::always_inline]] nce int32x2_t subtract_saturate(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); }
745[[gnu::always_inline]] nce int32x2_t subtract_absolute(int32x2_t a, int32x2_t b) { return vabd_s32(a, b); }
746[[gnu::always_inline]] nce int64x2_t subtract_absolute_long(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); }
747[[gnu::always_inline]] nce int32x2_t subtract_absolute_add(int32x2_t a, int32x2_t b, int32x2_t c) { return vaba_s32(a, b, c); }
748[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long(int32x2_t a, int32_t b) { return vqdmull_n_s32(a, b); }
749[[gnu::always_inline]] nce int32x2_t multiply_double_saturate_high(int32x2_t a, int32_t b) { return vqdmulh_n_s32(a, b); }
750[[gnu::always_inline]] nce int32x2_t multiply_double_round_saturate_high(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); }
751[[gnu::always_inline]] nce int32x2_t absolute(int32x2_t a) { return vabs_s32(a); }
752[[gnu::always_inline]] nce int32x2_t absolute_saturate(int32x2_t a) { return vqabs_s32(a); }
753[[gnu::always_inline]] nce int32x2_t max(int32x2_t a, int32x2_t b) { return vmax_s32(a, b); }
754[[gnu::always_inline]] nce int32x2_t min(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); }
755[[gnu::always_inline]] nce int32x2_t pairwise_add(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); }
756[[gnu::always_inline]] nce int64x1_t pairwise_add_long(int32x2_t a) { return vpaddl_s32(a); }
757[[gnu::always_inline]] nce int32x2_t pairwise_add_accumulate_long(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); }
758[[gnu::always_inline]] nce int32x2_t pairwise_max(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); }
759[[gnu::always_inline]] nce int32x2_t pairwise_min(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); }
760[[gnu::always_inline]] nce uint32x2_t equal(int32x2_t a, int32x2_t b) { return vceq_s32(a, b); }
761[[gnu::always_inline]] nce uint32x2_t greater_than_or_equal(int32x2_t a, int32x2_t b) { return vcge_s32(a, b); }
762[[gnu::always_inline]] nce uint32x2_t less_than_or_equal(int32x2_t a, int32x2_t b) { return vcle_s32(a, b); }
763[[gnu::always_inline]] nce uint32x2_t greater_than(int32x2_t a, int32x2_t b) { return vcgt_s32(a, b); }
764[[gnu::always_inline]] nce uint32x2_t less_than(int32x2_t a, int32x2_t b) { return vclt_s32(a, b); }
765[[gnu::always_inline]] nce uint32x2_t compare_test_nonzero(int32x2_t a, int32x2_t b) { return vtst_s32(a, b); }
766[[gnu::always_inline]] nce int32x2_t shift_left(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); }
767template <int n>[[gnu::always_inline]] nce int32x2_t shift_left(int32x2_t a) { return vshl_n_s32(a, n); }
768[[gnu::always_inline]] nce int32x2_t shift_left_saturate(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); }
769template <int n>[[gnu::always_inline]] nce int32x2_t shift_left_saturate(int32x2_t a) { return vqshl_n_s32(a, n); }
770template <int n>[[gnu::always_inline]] nce uint32x2_t shift_left_unsigned_saturate(int32x2_t a) { return vqshlu_n_s32(a, n); }
771[[gnu::always_inline]] nce int32x2_t shift_left_round(int32x2_t a, int32x2_t b) { return vrshl_s32(a, b); }
772[[gnu::always_inline]] nce int32x2_t shift_left_round_saturate(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); }
773template <int n>[[gnu::always_inline]] nce int64x2_t shift_left_long(int32x2_t a) { return vshll_n_s32(a, n); }
774template <int n>[[gnu::always_inline]] nce int32x2_t shift_left_insert(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, n); }
775template <int n>[[gnu::always_inline]] nce int32x2_t shift_right(int32x2_t a) { return vshr_n_s32(a, n); }
776template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_round(int32x2_t a) { return vrshr_n_s32(a, n); }
777template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_accumulate(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, n); }
778template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_accumulate_round(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, n); }
779template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_insert(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, n); }
780template <> [[gnu::always_inline]] nce float32x2_t convert(int32x2_t a) { return vcvt_f32_s32(a); }
781template <int fracbits> [[gnu::always_inline]] nce float32x2_t convert_n(int32x2_t a) { return vcvt_n_f32_s32(a, fracbits); }
782template <> [[gnu::always_inline]] nce int8x8_t reinterpret(int32x2_t a) { return vreinterpret_s8_s32(a); }
783template <> [[gnu::always_inline]] nce int16x4_t reinterpret(int32x2_t a) { return vreinterpret_s16_s32(a); }
784template <> [[gnu::always_inline]] nce float32x2_t reinterpret(int32x2_t a) { return vreinterpret_f32_s32(a); }
785template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(int32x2_t a) { return vreinterpret_u8_s32(a); }
786template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(int32x2_t a) { return vreinterpret_u16_s32(a); }
787template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(int32x2_t a) { return vreinterpret_u32_s32(a); }
788template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(int32x2_t a) { return vreinterpret_p8_s32(a); }
789template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(int32x2_t a) { return vreinterpret_p16_s32(a); }
790template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(int32x2_t a) { return vreinterpret_u64_s32(a); }
791template <> [[gnu::always_inline]] nce int64x1_t reinterpret(int32x2_t a) { return vreinterpret_s64_s32(a); }
792[[gnu::always_inline]] nce int64x2_t move_long(int32x2_t a) { return vmovl_s32(a); }
793template <int lane> [[gnu::always_inline]] nce int32x2_t multiply_add_lane(int32x2_t a, int32x2_t b, int32x2_t v) { return vmla_lane_s32(a, b, v, lane); }
794template <int lane> [[gnu::always_inline]] nce int32x2_t multiply_subtract_lane(int32x2_t a, int32x2_t b, int32x2_t v) { return vmls_lane_s32(a, b, v, lane); }
795[[gnu::always_inline]] nce int32x2_t multiply_add(int32x2_t a, int32x2_t b, int32_t c) { return vmla_n_s32(a, b, c); }
796template <int lane> [[gnu::always_inline]] nce int32x2_t multiply_lane(int32x2_t a, int32x2_t v) { return vmul_lane_s32(a, v, lane); }
797template <int lane> [[gnu::always_inline]] nce int64x2_t multiply_long_lane(int32x2_t a, int32x2_t v) { return vmull_lane_s32(a, v, lane); }
798[[gnu::always_inline]] nce int32x2_t multiply_subtract(int32x2_t a, int32x2_t b, int32_t c) { return vmls_n_s32(a, b, c); }
799[[gnu::always_inline]] nce int32x2_t negate(int32x2_t a) { return vneg_s32(a); }
800[[gnu::always_inline]] nce int32x2_t negate_saturate(int32x2_t a) { return vqneg_s32(a); }
801[[gnu::always_inline]] nce int32x2_t bitwise_not(int32x2_t a) { return vmvn_s32(a); }
802[[gnu::always_inline]] nce int32x2_t bitwise_and(int32x2_t a, int32x2_t b) { return vand_s32(a, b); }
803[[gnu::always_inline]] nce int32x2_t bitwise_or(int32x2_t a, int32x2_t b) { return vorr_s32(a, b); }
804[[gnu::always_inline]] nce int32x2_t bitwise_xor(int32x2_t a, int32x2_t b) { return veor_s32(a, b); }
805[[gnu::always_inline]] nce int32x2_t bitwise_or_not(int32x2_t a, int32x2_t b) { return vorn_s32(a, b); }
806[[gnu::always_inline]] nce int32x2_t count_leading_sign_bits(int32x2_t a) { return vcls_s32(a); }
807[[gnu::always_inline]] nce int32x2_t count_leading_zero_bits(int32x2_t a) { return vclz_s32(a); }
808[[gnu::always_inline]] nce int32x2_t bitwise_clear(int32x2_t a, int32x2_t b) { return vbic_s32(a, b); }
809template <int lane>[[gnu::always_inline]] nce int32x2_t duplicate_lane(int32x2_t a) { return vdup_lane_s32(a, lane); }
810template <int lane>[[gnu::always_inline]] nce int32x4_t duplicate_lane_quad(int32x2_t a) { return vdupq_lane_s32(a, lane); }
811[[gnu::always_inline]] nce int32x4_t combine(int32x2_t low, int32x2_t high) { return vcombine_s32(low, high); }
812template <int lane>[[gnu::always_inline]] nce int32_t get_lane(int32x2_t v) { return vget_lane_s32(v, lane); }
813template <int n>[[gnu::always_inline]] nce int32x2_t extract(int32x2_t a, int32x2_t b) { return vext_s32(a, b, n); }
814[[gnu::always_inline]] nce int32x2_t reverse_64bit(int32x2_t a) { return vrev64_s32(a); }
815[[gnu::always_inline]] nce int32x2x2_t zip(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); }
816[[gnu::always_inline]] nce int32x2x2_t unzip(int32x2_t a, int32x2_t b) { return vuzp_s32(a, b); }
817[[gnu::always_inline]] nce int32x2x2_t transpose(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); }
818[[gnu::always_inline]] nce int32x2_t multiply(int32x2_t a, int32_t b) { return vmul_n_s32(a, b); }
819[[gnu::always_inline]] nce int64x2_t multiply_long(int32x2_t a, int32_t b) { return vmull_n_s32(a, b); }
820[[gnu::always_inline]] nce int32x4_t add(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); }
821[[gnu::always_inline]] nce int32x4_t multiply_add_long(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); }
822[[gnu::always_inline]] nce int32x4_t multiply_subtract_long(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); }
823[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_s16(a, b, c); }
824[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_s16(a, b, c); }
825template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long_lane(int32x4_t a, int16x4_t b, int16x4_t v) { return vqdmlal_lane_s16(a, b, v, lane); }
826template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long_lane(int32x4_t a, int16x4_t b, int16x4_t v) { return vqdmlsl_lane_s16(a, b, v, lane); }
827[[gnu::always_inline]] nce int32x4_t subtract_absolute_add(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); }
828[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlal_n_s16(a, b, c); }
829[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlsl_n_s16(a, b, c); }
830[[gnu::always_inline]] nce int32x4_t subtract(int32x4_t a, int16x4_t b) { return vsubw_s16(a, b); }
831template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_double_saturate_high_lane(int32x4_t a, int32x2_t v) { return vqdmulhq_lane_s32(a, v, lane); }
832template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_double_round_saturate_high_lane(int32x4_t a, int32x2_t v) { return vqrdmulhq_lane_s32(a, v, lane); }
833[[gnu::always_inline]] nce int16x4_t add_narrow(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); }
834[[gnu::always_inline]] nce int16x4_t add_round_narrow(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); }
835[[gnu::always_inline]] nce int32x4_t multiply_add(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlaq_s32(a, b, c); }
836[[gnu::always_inline]] nce int32x4_t multiply_subtract(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlsq_s32(a, b, c); }
837[[gnu::always_inline]] nce int16x4_t subtract_narrow(int32x4_t a, int32x4_t b) { return vsubhn_s32(a, b); }
838[[gnu::always_inline]] nce int16x4_t subtract_round_narrow(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); }
839[[gnu::always_inline]] nce int32x4_t subtract_absolute_add(int32x4_t a, int32x4_t b, int32x4_t c) { return vabaq_s32(a, b, c); }
840[[gnu::always_inline]] nce int64x2_t pairwise_add_long(int32x4_t a) { return vpaddlq_s32(a); }
841[[gnu::always_inline]] nce int32x4_t pairwise_add_accumulate_long(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); }
842[[gnu::always_inline]] nce uint32x4_t equal(int32x4_t a, int32x4_t b) { return vceqq_s32(a, b); }
843[[gnu::always_inline]] nce uint32x4_t greater_than_or_equal(int32x4_t a, int32x4_t b) { return vcgeq_s32(a, b); }
844[[gnu::always_inline]] nce uint32x4_t less_than_or_equal(int32x4_t a, int32x4_t b) { return vcleq_s32(a, b); }
845[[gnu::always_inline]] nce uint32x4_t greater_than(int32x4_t a, int32x4_t b) { return vcgtq_s32(a, b); }
846[[gnu::always_inline]] nce uint32x4_t less_than(int32x4_t a, int32x4_t b) { return vcltq_s32(a, b); }
847[[gnu::always_inline]] nce uint32x4_t compare_test_nonzero(int32x4_t a, int32x4_t b) { return vtstq_s32(a, b); }
848template <int n>[[gnu::always_inline]] nce int32x4_t shift_left(int32x4_t a) { return vshlq_n_s32(a, n); }
849template <int n>[[gnu::always_inline]] nce uint32x4_t shift_left_unsigned_saturate(int32x4_t a) { return vqshluq_n_s32(a, n); }
850template <int n>[[gnu::always_inline]] nce int32x4_t shift_right_accumulate(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, n); }
851template <int n>[[gnu::always_inline]] nce int32x4_t shift_right_accumulate_round(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, n); }
852template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_narrow(int32x4_t a) { return vshrn_n_s32(a, n); }
853template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_saturate_narrow_unsigned(int32x4_t a) { return vqshrun_n_s32(a, n); }
854template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_saturate_narrow(int32x4_t a) { return vqshrn_n_s32(a, n); }
855template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_round_saturate_narrow_unsigned(int32x4_t a) { return vqrshrun_n_s32(a, n); }
856template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_round_saturate_narrow(int32x4_t a) { return vqrshrn_n_s32(a, n); }
857template <int n>[[gnu::always_inline]] nce int16x4_t shift_right_round_narrow(int32x4_t a) { return vrshrn_n_s32(a, n); }
858template <int fracbits> [[gnu::always_inline]] nce float32x4_t convert_n(int32x4_t a) { return vcvtq_n_f32_s32(a, fracbits); }
859template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(int32x4_t a) { return vreinterpretq_p8_s32(a); }
860template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(int32x4_t a) { return vreinterpretq_p16_s32(a); }
861[[gnu::always_inline]] nce int16x4_t move_narrow(int32x4_t a) { return vmovn_s32(a); }
862[[gnu::always_inline]] nce int16x4_t move_saturate_narrow(int32x4_t a) { return vqmovn_s32(a); }
863[[gnu::always_inline]] nce uint16x4_t move_unsigned_saturate_narrow(int32x4_t a) { return vqmovun_s32(a); }
864template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_add_long_lane(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlal_lane_s16(a, b, v, lane); }
865template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_subtract_long_lane(int32x4_t a, int16x4_t b, int16x4_t v) { return vmlsl_lane_s16(a, b, v, lane); }
866[[gnu::always_inline]] nce int32x4_t multiply_add_long(int32x4_t a, int16x4_t b, int16_t c) { return vmlal_n_s16(a, b, c); }
867[[gnu::always_inline]] nce int32x4_t multiply_subtract_long(int32x4_t a, int16x4_t b, int16_t c) { return vmlsl_n_s16(a, b, c); }
868template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_lane(int32x4_t a, int32x2_t v) { return vmulq_lane_s32(a, v, lane); }
869template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_add_lane(int32x4_t a, int32x4_t b, int32x2_t v) { return vmlaq_lane_s32(a, b, v, lane); }
870template <int lane> [[gnu::always_inline]] nce int32x4_t multiply_subtract_lane(int32x4_t a, int32x4_t b, int32x2_t v) { return vmlsq_lane_s32(a, b, v, lane); }
871[[gnu::always_inline]] nce int32x4_t multiply_add(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); }
872[[gnu::always_inline]] nce int32x4_t multiply_subtract(int32x4_t a, int32x4_t b, int32_t c) { return vmlsq_n_s32(a, b, c); }
873[[gnu::always_inline]] nce int32x2_t get_high(int32x4_t a) { return vget_high_s32(a); }
874[[gnu::always_inline]] nce int32x2_t get_low(int32x4_t a) { return vget_low_s32(a); }
875template <int n>[[gnu::always_inline]] nce int32x4_t extract(int32x4_t a, int32x4_t b) { return vextq_s32(a, b, n); }
876[[gnu::always_inline]] nce int32x4x2_t zip(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); }
877[[gnu::always_inline]] nce int32x4x2_t unzip(int32x4_t a, int32x4_t b) { return vuzpq_s32(a, b); }
878[[gnu::always_inline]] nce int32x4x2_t transpose(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); }
879[[gnu::always_inline]] nce uint64x1_t add(uint64x1_t a, uint64x1_t b) { return vadd_u64(a, b); }
880[[gnu::always_inline]] nce uint64x1_t add_saturate(uint64x1_t a, uint64x1_t b) { return vqadd_u64(a, b); }
881[[gnu::always_inline]] nce uint64x1_t subtract(uint64x1_t a, uint64x1_t b) { return vsub_u64(a, b); }
882[[gnu::always_inline]] nce uint64x1_t subtract_saturate(uint64x1_t a, uint64x1_t b) { return vqsub_u64(a, b); }
883[[gnu::always_inline]] nce uint64x1_t pairwise_add_accumulate_long(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); }
884[[gnu::always_inline]] nce uint64x1_t shift_left(uint64x1_t a, int64x1_t b) { return vshl_u64(a, b); }
885template <int n>[[gnu::always_inline]] nce uint64x1_t shift_left(uint64x1_t a) { return vshl_n_u64(a, n); }
886template <int n>[[gnu::always_inline]] nce uint64x1_t shift_right(uint64x1_t a) { return vshr_n_u64(a, n); }
887template <int n>[[gnu::always_inline]] nce uint64x1_t shift_right_round(uint64x1_t a) { return vrshr_n_u64(a, n); }
888template <int n>[[gnu::always_inline]] nce uint64x1_t shift_right_accumulate(uint64x1_t a, uint64x1_t b) { return vsra_n_u64(a, b, n); }
889template <int n>[[gnu::always_inline]] nce uint64x1_t shift_right_accumulate_round(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, n); }
890template <int n>[[gnu::always_inline]] nce uint64x1_t shift_right_insert(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, n); }
891[[gnu::always_inline]] nce uint64x1_t shift_left_saturate(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); }
892template <int n>[[gnu::always_inline]] nce uint64x1_t shift_left_saturate(uint64x1_t a) { return vqshl_n_u64(a, n); }
893template <int n>[[gnu::always_inline]] nce uint64x1_t shift_left_insert(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, n); }
894[[gnu::always_inline]] nce uint64x1_t shift_left_round(uint64x1_t a, int64x1_t b) { return vrshl_u64(a, b); }
895[[gnu::always_inline]] nce uint64x1_t shift_left_round_saturate(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); }
896template <> [[gnu::always_inline]] nce int8x8_t reinterpret(uint64x1_t a) { return vreinterpret_s8_u64(a); }
897template <> [[gnu::always_inline]] nce int16x4_t reinterpret(uint64x1_t a) { return vreinterpret_s16_u64(a); }
898template <> [[gnu::always_inline]] nce int32x2_t reinterpret(uint64x1_t a) { return vreinterpret_s32_u64(a); }
899template <> [[gnu::always_inline]] nce float32x2_t reinterpret(uint64x1_t a) { return vreinterpret_f32_u64(a); }
900template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(uint64x1_t a) { return vreinterpret_u8_u64(a); }
901template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(uint64x1_t a) { return vreinterpret_u16_u64(a); }
902template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(uint64x1_t a) { return vreinterpret_u32_u64(a); }
903template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(uint64x1_t a) { return vreinterpret_p8_u64(a); }
904template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(uint64x1_t a) { return vreinterpret_p16_u64(a); }
905template <> [[gnu::always_inline]] nce int64x1_t reinterpret(uint64x1_t a) { return vreinterpret_s64_u64(a); }
906[[gnu::always_inline]] nce uint64x1_t bitwise_and(uint64x1_t a, uint64x1_t b) { return vand_u64(a, b); }
907[[gnu::always_inline]] nce uint64x1_t bitwise_or(uint64x1_t a, uint64x1_t b) { return vorr_u64(a, b); }
908[[gnu::always_inline]] nce uint64x1_t bitwise_xor(uint64x1_t a, uint64x1_t b) { return veor_u64(a, b); }
909[[gnu::always_inline]] nce uint64x1_t bitwise_or_not(uint64x1_t a, uint64x1_t b) { return vorn_u64(a, b); }
910[[gnu::always_inline]] nce uint64x1_t bitwise_clear(uint64x1_t a, uint64x1_t b) { return vbic_u64(a, b); }
911[[gnu::always_inline]] nce uint64x1_t bitwise_select(uint64x1_t a, uint64x1_t b, uint64x1_t c) { return vbsl_u64(a, b, c); }
912template <int lane>[[gnu::always_inline]] nce uint64x1_t duplicate_lane(uint64x1_t a) { return vdup_lane_u64(a, lane); }
913template <int lane>[[gnu::always_inline]] nce uint64x2_t duplicate_lane_quad(uint64x1_t a) { return vdupq_lane_u64(a, lane); }
914[[gnu::always_inline]] nce uint64x2_t combine(uint64x1_t low, uint64x1_t high) { return vcombine_u64(low, high); }
915template <int lane>[[gnu::always_inline]] nce uint64_t get_lane(uint64x1_t v) { return vget_lane_u64(v, lane); }
916template <int n>[[gnu::always_inline]] nce uint64x1_t extract(uint64x1_t a, uint64x1_t b) { return vext_u64(a, b, n); }
917[[gnu::always_inline]] nce int64x1_t bitwise_select(uint64x1_t a, int64x1_t b, int64x1_t c) { return vbsl_s64(a, b, c); }
918[[gnu::always_inline]] nce uint64x2_t add(uint64x2_t a, uint64x2_t b) { return vaddq_u64(a, b); }
919[[gnu::always_inline]] nce uint32x2_t add_narrow(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); }
920[[gnu::always_inline]] nce uint32x2_t add_round_narrow(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); }
921[[gnu::always_inline]] nce uint64x2_t add_saturate(uint64x2_t a, uint64x2_t b) { return vqaddq_u64(a, b); }
922[[gnu::always_inline]] nce uint64x2_t subtract(uint64x2_t a, uint64x2_t b) { return vsubq_u64(a, b); }
923[[gnu::always_inline]] nce uint32x2_t subtract_narrow(uint64x2_t a, uint64x2_t b) { return vsubhn_u64(a, b); }
924[[gnu::always_inline]] nce uint32x2_t subtract_round_narrow(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); }
925[[gnu::always_inline]] nce uint64x2_t subtract_saturate(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); }
926[[gnu::always_inline]] nce uint64x2_t shift_left_saturate(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); }
927template <int n>[[gnu::always_inline]] nce uint64x2_t shift_left_saturate(uint64x2_t a) { return vqshlq_n_u64(a, n); }
928template <int n>[[gnu::always_inline]] nce uint64x2_t shift_left_insert(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, n); }
929[[gnu::always_inline]] nce uint64x2_t add(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); }
930[[gnu::always_inline]] nce uint64x2_t multiply_add_long(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); }
931[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, c); }
932[[gnu::always_inline]] nce uint64x2_t subtract(uint64x2_t a, uint32x2_t b) { return vsubw_u32(a, b); }
933[[gnu::always_inline]] nce uint64x2_t subtract_absolute_add(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); }
934[[gnu::always_inline]] nce uint64x2_t shift_left_round(uint64x2_t a, int64x2_t b) { return vrshlq_u64(a, b); }
935[[gnu::always_inline]] nce uint64x2_t shift_left_round_saturate(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); }
936template <int n>[[gnu::always_inline]] nce uint64x2_t shift_right(uint64x2_t a) { return vshrq_n_u64(a, n); }
937template <int n>[[gnu::always_inline]] nce uint64x2_t shift_right_round(uint64x2_t a) { return vrshrq_n_u64(a, n); }
938template <int n>[[gnu::always_inline]] nce uint64x2_t shift_right_accumulate(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, n); }
939template <int n>[[gnu::always_inline]] nce uint64x2_t shift_right_accumulate_round(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, n); }
940template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_narrow(uint64x2_t a) { return vshrn_n_u64(a, n); }
941template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_saturate_narrow(uint64x2_t a) { return vqshrn_n_u64(a, n); }
942template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_round_saturate_narrow(uint64x2_t a) { return vqrshrn_n_u64(a, n); }
943template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_round_narrow(uint64x2_t a) { return vrshrn_n_u64(a, n); }
944template <int n>[[gnu::always_inline]] nce uint64x2_t shift_right_insert(uint64x2_t a, uint64x2_t b) { return vsriq_n_u64(a, b, n); }
945template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(uint64x2_t a) { return vreinterpretq_p8_u64(a); }
946template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(uint64x2_t a) { return vreinterpretq_p16_u64(a); }
947[[gnu::always_inline]] nce uint32x2_t move_narrow(uint64x2_t a) { return vmovn_u64(a); }
948[[gnu::always_inline]] nce uint32x2_t move_saturate_narrow(uint64x2_t a) { return vqmovn_u64(a); }
949template <int lane> [[gnu::always_inline]] nce uint64x2_t multiply_add_long_lane(uint64x2_t a, uint32x2_t b, uint32x2_t v) { return vmlal_lane_u32(a, b, v, lane); }
950[[gnu::always_inline]] nce uint64x2_t pairwise_add_accumulate_long(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); }
951[[gnu::always_inline]] nce uint64x2_t shift_left(uint64x2_t a, int64x2_t b) { return vshlq_u64(a, b); }
952template <int n>[[gnu::always_inline]] nce uint64x2_t shift_left(uint64x2_t a) { return vshlq_n_u64(a, n); }
953[[gnu::always_inline]] nce uint64x2_t bitwise_and(uint64x2_t a, uint64x2_t b) { return vandq_u64(a, b); }
954[[gnu::always_inline]] nce uint64x2_t bitwise_or(uint64x2_t a, uint64x2_t b) { return vorrq_u64(a, b); }
955[[gnu::always_inline]] nce uint64x2_t bitwise_xor(uint64x2_t a, uint64x2_t b) { return veorq_u64(a, b); }
956[[gnu::always_inline]] nce uint64x2_t bitwise_or_not(uint64x2_t a, uint64x2_t b) { return vornq_u64(a, b); }
957[[gnu::always_inline]] nce uint64x2_t bitwise_clear(uint64x2_t a, uint64x2_t b) { return vbicq_u64(a, b); }
958[[gnu::always_inline]] nce uint64x2_t bitwise_select(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return vbslq_u64(a, b, c); }
959[[gnu::always_inline]] nce uint64x1_t get_high(uint64x2_t a) { return vget_high_u64(a); }
960[[gnu::always_inline]] nce uint64x1_t get_low(uint64x2_t a) { return vget_low_u64(a); }
961template <int n>[[gnu::always_inline]] nce uint64x2_t extract(uint64x2_t a, uint64x2_t b) { return vextq_u64(a, b, n); }
962template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long_lane(uint64x2_t a, uint32x2_t b, uint32x2_t v) { return vmlsl_lane_u32(a, b, v, lane); }
963[[gnu::always_inline]] nce uint64x2_t multiply_add_long(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlal_n_u32(a, b, c); }
964[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlsl_n_u32(a, b, c); }
965[[gnu::always_inline]] nce int64x2_t bitwise_select(uint64x2_t a, int64x2_t b, int64x2_t c) { return vbslq_s64(a, b, c); }
966[[gnu::always_inline]] nce uint32x2_t shift_left(uint32x2_t a, int32x2_t b) { return vshl_u32(a, b); }
967[[gnu::always_inline]] nce uint32x2_t add(uint32x2_t a, uint32x2_t b) { return vadd_u32(a, b); }
968[[gnu::always_inline]] nce uint64x2_t add_long(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); }
969[[gnu::always_inline]] nce uint32x2_t add_halve(uint32x2_t a, uint32x2_t b) { return vhadd_u32(a, b); }
970[[gnu::always_inline]] nce uint32x2_t add_halve_round(uint32x2_t a, uint32x2_t b) { return vrhadd_u32(a, b); }
971[[gnu::always_inline]] nce uint32x2_t add_saturate(uint32x2_t a, uint32x2_t b) { return vqadd_u32(a, b); }
972[[gnu::always_inline]] nce uint32x2_t multiply(uint32x2_t a, uint32x2_t b) { return vmul_u32(a, b); }
973[[gnu::always_inline]] nce uint32x2_t multiply_add(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_u32(a, b, c); }
974[[gnu::always_inline]] nce uint32x2_t multiply_subtract(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_u32(a, b, c); }
975[[gnu::always_inline]] nce uint64x2_t multiply_long(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); }
976[[gnu::always_inline]] nce uint32x2_t subtract(uint32x2_t a, uint32x2_t b) { return vsub_u32(a, b); }
977[[gnu::always_inline]] nce uint64x2_t subtract_long(uint32x2_t a, uint32x2_t b) { return vsubl_u32(a, b); }
978[[gnu::always_inline]] nce uint32x2_t subtract_halve(uint32x2_t a, uint32x2_t b) { return vhsub_u32(a, b); }
979[[gnu::always_inline]] nce uint32x2_t subtract_saturate(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); }
980[[gnu::always_inline]] nce uint32x2_t subtract_absolute(uint32x2_t a, uint32x2_t b) { return vabd_u32(a, b); }
981[[gnu::always_inline]] nce uint64x2_t subtract_absolute_long(uint32x2_t a, uint32x2_t b) { return vabdl_u32(a, b); }
982[[gnu::always_inline]] nce uint32x2_t subtract_absolute_add(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vaba_u32(a, b, c); }
983[[gnu::always_inline]] nce uint32x2_t max(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); }
984[[gnu::always_inline]] nce uint32x2_t min(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); }
985[[gnu::always_inline]] nce uint32x2_t reciprocal_estimate(uint32x2_t a) { return vrecpe_u32(a); }
986[[gnu::always_inline]] nce uint32x2_t reciprocal_sqrt_estimate(uint32x2_t a) { return vrsqrte_u32(a); }
987[[gnu::always_inline]] nce uint32x2_t pairwise_add(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); }
988[[gnu::always_inline]] nce uint64x1_t pairwise_add_long(uint32x2_t a) { return vpaddl_u32(a); }
989[[gnu::always_inline]] nce uint32x2_t pairwise_add_accumulate_long(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); }
990[[gnu::always_inline]] nce uint32x2_t pairwise_max(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); }
991[[gnu::always_inline]] nce uint32x2_t pairwise_min(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); }
992[[gnu::always_inline]] nce uint32x2_t equal(uint32x2_t a, uint32x2_t b) { return vceq_u32(a, b); }
993[[gnu::always_inline]] nce uint32x2_t greater_than_or_equal(uint32x2_t a, uint32x2_t b) { return vcge_u32(a, b); }
994[[gnu::always_inline]] nce uint32x2_t less_than_or_equal(uint32x2_t a, uint32x2_t b) { return vcle_u32(a, b); }
995[[gnu::always_inline]] nce uint32x2_t greater_than(uint32x2_t a, uint32x2_t b) { return vcgt_u32(a, b); }
996[[gnu::always_inline]] nce uint32x2_t less_than(uint32x2_t a, uint32x2_t b) { return vclt_u32(a, b); }
997[[gnu::always_inline]] nce uint32x2_t compare_test_nonzero(uint32x2_t a, uint32x2_t b) { return vtst_u32(a, b); }
998template <int n>[[gnu::always_inline]] nce uint32x2_t shift_left(uint32x2_t a) { return vshl_n_u32(a, n); }
999[[gnu::always_inline]] nce uint32x2_t shift_left_saturate(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); }
1000template <int n>[[gnu::always_inline]] nce uint32x2_t shift_left_saturate(uint32x2_t a) { return vqshl_n_u32(a, n); }
1001[[gnu::always_inline]] nce uint32x2_t shift_left_round(uint32x2_t a, int32x2_t b) { return vrshl_u32(a, b); }
1002[[gnu::always_inline]] nce uint32x2_t shift_left_round_saturate(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); }
1003template <int n>[[gnu::always_inline]] nce uint64x2_t shift_left_long(uint32x2_t a) { return vshll_n_u32(a, n); }
1004template <int n>[[gnu::always_inline]] nce uint32x2_t shift_left_insert(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, n); }
1005template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right(uint32x2_t a) { return vshr_n_u32(a, n); }
1006template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_round(uint32x2_t a) { return vrshr_n_u32(a, n); }
1007template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_accumulate(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, n); }
1008template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_accumulate_round(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, n); }
1009template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_insert(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, n); }
1010template <> [[gnu::always_inline]] nce float32x2_t convert(uint32x2_t a) { return vcvt_f32_u32(a); }
1011template <int fracbits> [[gnu::always_inline]] nce float32x2_t convert_n(uint32x2_t a) { return vcvt_n_f32_u32(a, fracbits); }
1012template <> [[gnu::always_inline]] nce int8x8_t reinterpret(uint32x2_t a) { return vreinterpret_s8_u32(a); }
1013template <> [[gnu::always_inline]] nce int16x4_t reinterpret(uint32x2_t a) { return vreinterpret_s16_u32(a); }
1014template <> [[gnu::always_inline]] nce int32x2_t reinterpret(uint32x2_t a) { return vreinterpret_s32_u32(a); }
1015template <> [[gnu::always_inline]] nce float32x2_t reinterpret(uint32x2_t a) { return vreinterpret_f32_u32(a); }
1016template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(uint32x2_t a) { return vreinterpret_u8_u32(a); }
1017template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(uint32x2_t a) { return vreinterpret_u16_u32(a); }
1018template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(uint32x2_t a) { return vreinterpret_p8_u32(a); }
1019template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(uint32x2_t a) { return vreinterpret_p16_u32(a); }
1020template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(uint32x2_t a) { return vreinterpret_u64_u32(a); }
1021template <> [[gnu::always_inline]] nce int64x1_t reinterpret(uint32x2_t a) { return vreinterpret_s64_u32(a); }
1022[[gnu::always_inline]] nce uint64x2_t move_long(uint32x2_t a) { return vmovl_u32(a); }
1023template <int lane> [[gnu::always_inline]] nce uint32x2_t multiply_add_lane(uint32x2_t a, uint32x2_t b, uint32x2_t v) { return vmla_lane_u32(a, b, v, lane); }
1024template <int lane> [[gnu::always_inline]] nce uint32x2_t multiply_subtract_lane(uint32x2_t a, uint32x2_t b, uint32x2_t v) { return vmls_lane_u32(a, b, v, lane); }
1025[[gnu::always_inline]] nce uint32x2_t multiply_add(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmla_n_u32(a, b, c); }
1026template <int lane> [[gnu::always_inline]] nce uint32x2_t multiply_lane(uint32x2_t a, uint32x2_t v) { return vmul_lane_u32(a, v, lane); }
1027template <int lane> [[gnu::always_inline]] nce uint64x2_t multiply_long_lane(uint32x2_t a, uint32x2_t v) { return vmull_lane_u32(a, v, lane); }
1028[[gnu::always_inline]] nce uint32x2_t multiply_subtract(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmls_n_u32(a, b, c); }
1029[[gnu::always_inline]] nce uint32x2_t bitwise_not(uint32x2_t a) { return vmvn_u32(a); }
1030[[gnu::always_inline]] nce uint32x2_t bitwise_and(uint32x2_t a, uint32x2_t b) { return vand_u32(a, b); }
1031[[gnu::always_inline]] nce uint32x2_t bitwise_or(uint32x2_t a, uint32x2_t b) { return vorr_u32(a, b); }
1032[[gnu::always_inline]] nce uint32x2_t bitwise_xor(uint32x2_t a, uint32x2_t b) { return veor_u32(a, b); }
1033[[gnu::always_inline]] nce uint32x2_t bitwise_or_not(uint32x2_t a, uint32x2_t b) { return vorn_u32(a, b); }
1034#ifdef __clang__
1035[[gnu::always_inline]] nce int32x2_t count_leading_sign_bits(uint32x2_t a) { return vcls_u32(a); }
1036#endif
1037[[gnu::always_inline]] nce uint32x2_t count_leading_zero_bits(uint32x2_t a) { return vclz_u32(a); }
1038[[gnu::always_inline]] nce int32x2_t bitwise_select(uint32x2_t a, int32x2_t b, int32x2_t c) { return vbsl_s32(a, b, c); }
1039[[gnu::always_inline]] nce uint32x2_t bitwise_clear(uint32x2_t a, uint32x2_t b) { return vbic_u32(a, b); }
1040[[gnu::always_inline]] nce uint32x2_t bitwise_select(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vbsl_u32(a, b, c); }
1041template <int lane>[[gnu::always_inline]] nce uint32x2_t duplicate_lane(uint32x2_t a) { return vdup_lane_u32(a, lane); }
1042template <int lane>[[gnu::always_inline]] nce uint32x4_t duplicate_lane_quad(uint32x2_t a) { return vdupq_lane_u32(a, lane); }
1043[[gnu::always_inline]] nce uint32x4_t combine(uint32x2_t low, uint32x2_t high) { return vcombine_u32(low, high); }
1044template <int lane>[[gnu::always_inline]] nce uint32_t get_lane(uint32x2_t v) { return vget_lane_u32(v, lane); }
1045template <int n>[[gnu::always_inline]] nce uint32x2_t extract(uint32x2_t a, uint32x2_t b) { return vext_u32(a, b, n); }
1046[[gnu::always_inline]] nce uint32x2_t reverse_64bit(uint32x2_t a) { return vrev64_u32(a); }
1047[[gnu::always_inline]] nce uint32x2x2_t zip(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); }
1048[[gnu::always_inline]] nce uint32x2x2_t unzip(uint32x2_t a, uint32x2_t b) { return vuzp_u32(a, b); }
1049[[gnu::always_inline]] nce uint32x2x2_t transpose(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); }
1050[[gnu::always_inline]] nce float32x2_t bitwise_select(uint32x2_t a, float32x2_t b, float32x2_t c) { return vbsl_f32(a, b, c); }
1051[[gnu::always_inline]] nce uint32x2_t multiply(uint32x2_t a, uint32_t b) { return vmul_n_u32(a, b); }
1052[[gnu::always_inline]] nce uint64x2_t multiply_long(uint32x2_t a, uint32_t b) { return vmull_n_u32(a, b); }
1053[[gnu::always_inline]] nce uint32x4_t add(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); }
1054[[gnu::always_inline]] nce uint32x4_t multiply_add_long(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); }
1055[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); }
1056[[gnu::always_inline]] nce uint32x4_t subtract(uint32x4_t a, uint16x4_t b) { return vsubw_u16(a, b); }
1057[[gnu::always_inline]] nce uint32x4_t subtract_absolute_add(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); }
1058[[gnu::always_inline]] nce uint16x4_t add_narrow(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); }
1059[[gnu::always_inline]] nce uint16x4_t add_round_narrow(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); }
1060[[gnu::always_inline]] nce uint32x4_t multiply_add(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlaq_u32(a, b, c); }
1061[[gnu::always_inline]] nce uint32x4_t multiply_subtract(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlsq_u32(a, b, c); }
1062[[gnu::always_inline]] nce uint16x4_t subtract_narrow(uint32x4_t a, uint32x4_t b) { return vsubhn_u32(a, b); }
1063[[gnu::always_inline]] nce uint16x4_t subtract_round_narrow(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); }
1064[[gnu::always_inline]] nce uint32x4_t subtract_absolute_add(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vabaq_u32(a, b, c); }
1065[[gnu::always_inline]] nce uint32x4_t reciprocal_estimate(uint32x4_t a) { return vrecpeq_u32(a); }
1066[[gnu::always_inline]] nce uint32x4_t reciprocal_sqrt_estimate(uint32x4_t a) { return vrsqrteq_u32(a); }
1067[[gnu::always_inline]] nce uint64x2_t pairwise_add_long(uint32x4_t a) { return vpaddlq_u32(a); }
1068[[gnu::always_inline]] nce uint32x4_t pairwise_add_accumulate_long(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); }
1069[[gnu::always_inline]] nce uint32x4_t equal(uint32x4_t a, uint32x4_t b) { return vceqq_u32(a, b); }
1070[[gnu::always_inline]] nce uint32x4_t greater_than_or_equal(uint32x4_t a, uint32x4_t b) { return vcgeq_u32(a, b); }
1071[[gnu::always_inline]] nce uint32x4_t less_than_or_equal(uint32x4_t a, uint32x4_t b) { return vcleq_u32(a, b); }
1072[[gnu::always_inline]] nce uint32x4_t greater_than(uint32x4_t a, uint32x4_t b) { return vcgtq_u32(a, b); }
1073[[gnu::always_inline]] nce uint32x4_t less_than(uint32x4_t a, uint32x4_t b) { return vcltq_u32(a, b); }
1074[[gnu::always_inline]] nce uint32x4_t compare_test_nonzero(uint32x4_t a, uint32x4_t b) { return vtstq_u32(a, b); }
1075template <int n>[[gnu::always_inline]] nce uint32x4_t shift_left(uint32x4_t a) { return vshlq_n_u32(a, n); }
1076template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_accumulate(uint32x4_t a, uint32x4_t b) { return vsraq_n_u32(a, b, n); }
1077template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_accumulate_round(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, n); }
1078template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_narrow(uint32x4_t a) { return vshrn_n_u32(a, n); }
1079template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_saturate_narrow(uint32x4_t a) { return vqshrn_n_u32(a, n); }
1080template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_round_saturate_narrow(uint32x4_t a) { return vqrshrn_n_u32(a, n); }
1081template <int n>[[gnu::always_inline]] nce uint16x4_t shift_right_round_narrow(uint32x4_t a) { return vrshrn_n_u32(a, n); }
1082template <int fracbits> [[gnu::always_inline]] nce float32x4_t convert_n(uint32x4_t a) { return vcvtq_n_f32_u32(a, fracbits); }
1083template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(uint32x4_t a) { return vreinterpretq_p8_u32(a); }
1084template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(uint32x4_t a) { return vreinterpretq_p16_u32(a); }
1085[[gnu::always_inline]] nce uint16x4_t move_narrow(uint32x4_t a) { return vmovn_u32(a); }
1086[[gnu::always_inline]] nce uint16x4_t move_saturate_narrow(uint32x4_t a) { return vqmovn_u32(a); }
1087template <int lane> [[gnu::always_inline]] nce uint32x4_t multiply_add_long_lane(uint32x4_t a, uint16x4_t b, uint16x4_t v) { return vmlal_lane_u16(a, b, v, lane); }
1088template <int lane> [[gnu::always_inline]] nce uint32x4_t multiply_subtract_long_lane(uint32x4_t a, uint16x4_t b, uint16x4_t v) { return vmlsl_lane_u16(a, b, v, lane); }
1089[[gnu::always_inline]] nce uint32x4_t multiply_add_long(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlal_n_u16(a, b, c); }
1090[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlsl_n_u16(a, b, c); }
1091template <int lane> [[gnu::always_inline]] nce uint32x4_t multiply_lane(uint32x4_t a, uint32x2_t v) { return vmulq_lane_u32(a, v, lane); }
1092template <int lane> [[gnu::always_inline]] nce uint32x4_t multiply_add_lane(uint32x4_t a, uint32x4_t b, uint32x2_t v) { return vmlaq_lane_u32(a, b, v, lane); }
1093template <int lane> [[gnu::always_inline]] nce uint32x4_t multiply_subtract_lane(uint32x4_t a, uint32x4_t b, uint32x2_t v) { return vmlsq_lane_u32(a, b, v, lane); }
1094[[gnu::always_inline]] nce uint32x4_t multiply_add(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); }
1095[[gnu::always_inline]] nce uint32x4_t multiply_subtract(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlsq_n_u32(a, b, c); }
1096#ifdef __clang__
1097[[gnu::always_inline]] nce int32x4_t count_leading_sign_bits(uint32x4_t a) { return vclsq_u32(a); }
1098#endif
1099[[gnu::always_inline]] nce int32x4_t bitwise_select(uint32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); }
1100[[gnu::always_inline]] nce uint32x4_t bitwise_select(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vbslq_u32(a, b, c); }
1101[[gnu::always_inline]] nce uint32x2_t get_high(uint32x4_t a) { return vget_high_u32(a); }
1102[[gnu::always_inline]] nce uint32x2_t get_low(uint32x4_t a) { return vget_low_u32(a); }
1103template <int n>[[gnu::always_inline]] nce uint32x4_t extract(uint32x4_t a, uint32x4_t b) { return vextq_u32(a, b, n); }
1104[[gnu::always_inline]] nce uint32x4x2_t zip(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); }
1105[[gnu::always_inline]] nce uint32x4x2_t unzip(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); }
1106[[gnu::always_inline]] nce uint32x4x2_t transpose(uint32x4_t a, uint32x4_t b) { return vtrnq_u32(a, b); }
1107[[gnu::always_inline]] nce float32x4_t bitwise_select(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); }
1108[[gnu::always_inline]] nce float32x2_t add(float32x2_t a, float32x2_t b) { return vadd_f32(a, b); }
1109[[gnu::always_inline]] nce float32x2_t multiply(float32x2_t a, float32x2_t b) { return vmul_f32(a, b); }
1110[[gnu::always_inline]] nce float32x2_t multiply_add(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_f32(a, b, c); }
1111[[gnu::always_inline]] nce float32x2_t multiply_subtract(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_f32(a, b, c); }
1112[[gnu::always_inline]] nce float32x2_t subtract(float32x2_t a, float32x2_t b) { return vsub_f32(a, b); }
1113[[gnu::always_inline]] nce float32x2_t subtract_absolute(float32x2_t a, float32x2_t b) { return vabd_f32(a, b); }
1114[[gnu::always_inline]] nce float32x2_t absolute(float32x2_t a) { return vabs_f32(a); }
1115[[gnu::always_inline]] nce float32x2_t max(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); }
1116[[gnu::always_inline]] nce float32x2_t min(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); }
1117[[gnu::always_inline]] nce float32x2_t reciprocal_estimate(float32x2_t a) { return vrecpe_f32(a); }
1118[[gnu::always_inline]] nce float32x2_t reciprocal_step(float32x2_t a, float32x2_t b) { return vrecps_f32(a, b); }
1119[[gnu::always_inline]] nce float32x2_t reciprocal_sqrt_estimate(float32x2_t a) { return vrsqrte_f32(a); }
1120[[gnu::always_inline]] nce float32x2_t reciprocal_sqrt_step(float32x2_t a, float32x2_t b) { return vrsqrts_f32(a, b); }
1121[[gnu::always_inline]] nce float32x2_t pairwise_add(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); }
1122[[gnu::always_inline]] nce float32x2_t pairwise_max(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); }
1123[[gnu::always_inline]] nce float32x2_t pairwise_min(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); }
1124[[gnu::always_inline]] nce uint32x2_t equal(float32x2_t a, float32x2_t b) { return vceq_f32(a, b); }
1125[[gnu::always_inline]] nce uint32x2_t greater_than_or_equal(float32x2_t a, float32x2_t b) { return vcge_f32(a, b); }
1126[[gnu::always_inline]] nce uint32x2_t less_than_or_equal(float32x2_t a, float32x2_t b) { return vcle_f32(a, b); }
1127[[gnu::always_inline]] nce uint32x2_t greater_than(float32x2_t a, float32x2_t b) { return vcgt_f32(a, b); }
1128[[gnu::always_inline]] nce uint32x2_t less_than(float32x2_t a, float32x2_t b) { return vclt_f32(a, b); }
1129[[gnu::always_inline]] nce uint32x2_t absolute_greater_than_or_equal(float32x2_t a, float32x2_t b) { return vcage_f32(a, b); }
1130[[gnu::always_inline]] nce uint32x2_t absolute_less_than_or_equal(float32x2_t a, float32x2_t b) { return vcale_f32(a, b); }
1131[[gnu::always_inline]] nce uint32x2_t absolute_greater_than(float32x2_t a, float32x2_t b) { return vcagt_f32(a, b); }
1132[[gnu::always_inline]] nce uint32x2_t absolute_less_than(float32x2_t a, float32x2_t b) { return vcalt_f32(a, b); }
1133template <> [[gnu::always_inline]] nce int32x2_t convert(float32x2_t a) { return vcvt_s32_f32(a); }
1134template <> [[gnu::always_inline]] nce uint32x2_t convert(float32x2_t a) { return vcvt_u32_f32(a); }
1135template <int fracbits> [[gnu::always_inline]] nce int32x2_t convert_n_signed(float32x2_t a) { return vcvt_n_s32_f32(a, fracbits); }
1136template <int fracbits> [[gnu::always_inline]] nce uint32x2_t convert_n_unsigned(float32x2_t a) { return vcvt_n_u32_f32(a, fracbits); }
1137template <> [[gnu::always_inline]] nce int8x8_t reinterpret(float32x2_t a) { return vreinterpret_s8_f32(a); }
1138template <> [[gnu::always_inline]] nce int16x4_t reinterpret(float32x2_t a) { return vreinterpret_s16_f32(a); }
1139template <> [[gnu::always_inline]] nce int32x2_t reinterpret(float32x2_t a) { return vreinterpret_s32_f32(a); }
1140template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(float32x2_t a) { return vreinterpret_u8_f32(a); }
1141template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(float32x2_t a) { return vreinterpret_u16_f32(a); }
1142template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(float32x2_t a) { return vreinterpret_u32_f32(a); }
1143template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(float32x2_t a) { return vreinterpret_p8_f32(a); }
1144template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(float32x2_t a) { return vreinterpret_p16_f32(a); }
1145template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(float32x2_t a) { return vreinterpret_u64_f32(a); }
1146template <> [[gnu::always_inline]] nce int64x1_t reinterpret(float32x2_t a) { return vreinterpret_s64_f32(a); }
1147template <int lane> [[gnu::always_inline]] nce float32x2_t multiply_add_lane(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, lane); }
1148template <int lane> [[gnu::always_inline]] nce float32x2_t multiply_subtract_lane(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, lane); }
1149[[gnu::always_inline]] nce float32x2_t multiply_add(float32x2_t a, float32x2_t b, float32_t c) { return vmla_n_f32(a, b, c); }
1150template <int lane> [[gnu::always_inline]] nce float32x2_t multiply_lane(float32x2_t a, float32x2_t v) { return vmul_lane_f32(a, v, lane); }
1151[[gnu::always_inline]] nce float32x2_t multiply_subtract(float32x2_t a, float32x2_t b, float32_t c) { return vmls_n_f32(a, b, c); }
1152template <int lane>[[gnu::always_inline]] nce float32x2_t duplicate_lane(float32x2_t a) { return vdup_lane_f32(a, lane); }
1153template <int lane>[[gnu::always_inline]] nce float32x4_t duplicate_lane_quad(float32x2_t a) { return vdupq_lane_f32(a, lane); }
1154[[gnu::always_inline]] nce float32x4_t combine(float32x2_t low, float32x2_t high) { return vcombine_f32(low, high); }
1155template <int lane>[[gnu::always_inline]] nce float32_t get_lane(float32x2_t v) { return vget_lane_f32(v, lane); }
1156template <int n>[[gnu::always_inline]] nce float32x2_t extract(float32x2_t a, float32x2_t b) { return vext_f32(a, b, n); }
1157[[gnu::always_inline]] nce float32x2_t reverse_64bit(float32x2_t a) { return vrev64_f32(a); }
1158[[gnu::always_inline]] nce float32x2x2_t zip(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); }
1159[[gnu::always_inline]] nce float32x2x2_t unzip(float32x2_t a, float32x2_t b) { return vuzp_f32(a, b); }
1160[[gnu::always_inline]] nce float32x2x2_t transpose(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); }
1161[[gnu::always_inline]] nce float32x2_t multiply(float32x2_t a, float32_t b) { return vmul_n_f32(a, b); }
1162[[gnu::always_inline]] nce float32x2_t negate(float32x2_t a) { return vneg_f32(a); }
1163[[gnu::always_inline]] nce float32x4_t multiply_add(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlaq_f32(a, b, c); }
1164[[gnu::always_inline]] nce float32x4_t multiply_subtract(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlsq_f32(a, b, c); }
1165[[gnu::always_inline]] nce float32x4_t max(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); }
1166[[gnu::always_inline]] nce float32x4_t min(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); }
1167[[gnu::always_inline]] nce float32x4_t reciprocal_estimate(float32x4_t a) { return vrecpeq_f32(a); }
1168[[gnu::always_inline]] nce float32x4_t reciprocal_step(float32x4_t a, float32x4_t b) { return vrecpsq_f32(a, b); }
1169[[gnu::always_inline]] nce float32x4_t reciprocal_sqrt_estimate(float32x4_t a) { return vrsqrteq_f32(a); }
1170[[gnu::always_inline]] nce float32x4_t reciprocal_sqrt_step(float32x4_t a, float32x4_t b) { return vrsqrtsq_f32(a, b); }
1171[[gnu::always_inline]] nce uint32x4_t equal(float32x4_t a, float32x4_t b) { return vceqq_f32(a, b); }
1172[[gnu::always_inline]] nce uint32x4_t greater_than_or_equal(float32x4_t a, float32x4_t b) { return vcgeq_f32(a, b); }
1173[[gnu::always_inline]] nce uint32x4_t less_than_or_equal(float32x4_t a, float32x4_t b) { return vcleq_f32(a, b); }
1174[[gnu::always_inline]] nce uint32x4_t greater_than(float32x4_t a, float32x4_t b) { return vcgtq_f32(a, b); }
1175[[gnu::always_inline]] nce uint32x4_t less_than(float32x4_t a, float32x4_t b) { return vcltq_f32(a, b); }
1176[[gnu::always_inline]] nce uint32x4_t absolute_greater_than_or_equal(float32x4_t a, float32x4_t b) { return vcageq_f32(a, b); }
1177[[gnu::always_inline]] nce uint32x4_t absolute_less_than_or_equal(float32x4_t a, float32x4_t b) { return vcaleq_f32(a, b); }
1178[[gnu::always_inline]] nce uint32x4_t absolute_greater_than(float32x4_t a, float32x4_t b) { return vcagtq_f32(a, b); }
1179[[gnu::always_inline]] nce uint32x4_t absolute_less_than(float32x4_t a, float32x4_t b) { return vcaltq_f32(a, b); }
1180template <int fracbits> [[gnu::always_inline]] nce int32x4_t convert_n_signed(float32x4_t a) { return vcvtq_n_s32_f32(a, fracbits); }
1181template <int fracbits> [[gnu::always_inline]] nce uint32x4_t convert_n_unsigned(float32x4_t a) { return vcvtq_n_u32_f32(a, fracbits); }
1182template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(float32x4_t a) { return vreinterpretq_p8_f32(a); }
1183template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(float32x4_t a) { return vreinterpretq_p16_f32(a); }
1184template <int lane> [[gnu::always_inline]] nce float32x4_t multiply_lane(float32x4_t a, float32x2_t v) { return vmulq_lane_f32(a, v, lane); }
1185template <int lane> [[gnu::always_inline]] nce float32x4_t multiply_add_lane(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, lane); }
1186template <int lane> [[gnu::always_inline]] nce float32x4_t multiply_subtract_lane(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, lane); }
1187[[gnu::always_inline]] nce float32x4_t multiply_add(float32x4_t a, float32x4_t b, float32_t c) { return vmlaq_n_f32(a, b, c); }
1188[[gnu::always_inline]] nce float32x4_t multiply_subtract(float32x4_t a, float32x4_t b, float32_t c) { return vmlsq_n_f32(a, b, c); }
1189[[gnu::always_inline]] nce float32x2_t get_high(float32x4_t a) { return vget_high_f32(a); }
1190[[gnu::always_inline]] nce float32x2_t get_low(float32x4_t a) { return vget_low_f32(a); }
1191template <int n>[[gnu::always_inline]] nce float32x4_t extract(float32x4_t a, float32x4_t b) { return vextq_f32(a, b, n); }
1192[[gnu::always_inline]] nce float32x4x2_t zip(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); }
1193[[gnu::always_inline]] nce float32x4x2_t unzip(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); }
1194[[gnu::always_inline]] nce float32x4x2_t transpose(float32x4_t a, float32x4_t b) { return vtrnq_f32(a, b); }
1195[[gnu::always_inline]] nce poly8x8_t multiply(poly8x8_t a, poly8x8_t b) { return vmul_p8(a, b); }
1196[[gnu::always_inline]] nce poly16x8_t multiply_long(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); }
1197[[gnu::always_inline]] nce uint8x8_t equal(poly8x8_t a, poly8x8_t b) { return vceq_p8(a, b); }
1198[[gnu::always_inline]] nce uint8x8_t compare_test_nonzero(poly8x8_t a, poly8x8_t b) { return vtst_p8(a, b); }
1199template <int n>[[gnu::always_inline]] nce poly8x8_t shift_left_insert(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, n); }
1200template <int n>[[gnu::always_inline]] nce poly8x8_t shift_right_insert(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, n); }
1201template <> [[gnu::always_inline]] nce int8x8_t reinterpret(poly8x8_t a) { return vreinterpret_s8_p8(a); }
1202template <> [[gnu::always_inline]] nce int16x4_t reinterpret(poly8x8_t a) { return vreinterpret_s16_p8(a); }
1203template <> [[gnu::always_inline]] nce int32x2_t reinterpret(poly8x8_t a) { return vreinterpret_s32_p8(a); }
1204template <> [[gnu::always_inline]] nce float32x2_t reinterpret(poly8x8_t a) { return vreinterpret_f32_p8(a); }
1205template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(poly8x8_t a) { return vreinterpret_u8_p8(a); }
1206template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(poly8x8_t a) { return vreinterpret_u16_p8(a); }
1207template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(poly8x8_t a) { return vreinterpret_u32_p8(a); }
1208template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(poly8x8_t a) { return vreinterpret_p16_p8(a); }
1209template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(poly8x8_t a) { return vreinterpret_u64_p8(a); }
1210template <> [[gnu::always_inline]] nce int64x1_t reinterpret(poly8x8_t a) { return vreinterpret_s64_p8(a); }
1211[[gnu::always_inline]] nce poly8x8_t bitwise_not(poly8x8_t a) { return vmvn_p8(a); }
1212[[gnu::always_inline]] nce poly8x8_t count_active_bits(poly8x8_t a) { return vcnt_p8(a); }
1213template <int lane>[[gnu::always_inline]] nce poly8x8_t duplicate_lane(poly8x8_t a) { return vdup_lane_p8(a, lane); }
1214template <int lane>[[gnu::always_inline]] nce poly8x16_t duplicate_lane_quad(poly8x8_t a) { return vdupq_lane_p8(a, lane); }
1215[[gnu::always_inline]] nce poly8x16_t combine(poly8x8_t low, poly8x8_t high) { return vcombine_p8(low, high); }
1216template <int lane>[[gnu::always_inline]] nce poly8_t get_lane(poly8x8_t v) { return vget_lane_p8(v, lane); }
1217template <int n>[[gnu::always_inline]] nce poly8x8_t extract(poly8x8_t a, poly8x8_t b) { return vext_p8(a, b, n); }
1218[[gnu::always_inline]] nce poly8x8_t reverse_64bit(poly8x8_t a) { return vrev64_p8(a); }
1219[[gnu::always_inline]] nce poly8x8_t reverse_32bit(poly8x8_t a) { return vrev32_p8(a); }
1220[[gnu::always_inline]] nce poly8x8_t reverse_16bit(poly8x8_t a) { return vrev16_p8(a); }
1221[[gnu::always_inline]] nce poly8x8_t table_lookup1(poly8x8_t a, uint8x8_t idx) { return vtbl1_p8(a, idx); }
1222[[gnu::always_inline]] nce poly8x8x2_t zip(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); }
1223[[gnu::always_inline]] nce poly8x8x2_t unzip(poly8x8_t a, poly8x8_t b) { return vuzp_p8(a, b); }
1224[[gnu::always_inline]] nce poly8x8x2_t transpose(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); }
1225[[gnu::always_inline]] nce poly8x8_t table_extension1(poly8x8_t a, poly8x8_t b, uint8x8_t idx) { return vtbx1_p8(a, b, idx); }
1226#ifdef __clang__
1227[[gnu::always_inline]] nce poly8x8_t add(poly8x8_t a, poly8x8_t b) { return vadd_p8(a, b); }
1228#endif
1229[[gnu::always_inline]] nce poly8x8_t table_extension2(poly8x8_t a, poly8x8x2_t b, uint8x8_t idx) { return vtbx2_p8(a, b, idx); }
1230[[gnu::always_inline]] nce poly8x8_t table_extension3(poly8x8_t a, poly8x8x3_t b, uint8x8_t idx) { return vtbx3_p8(a, b, idx); }
1231[[gnu::always_inline]] nce poly8x8_t table_extension4(poly8x8_t a, poly8x8x4_t b, uint8x8_t idx) { return vtbx4_p8(a, b, idx); }
1232template <int n>[[gnu::always_inline]] nce poly16x4_t shift_left_insert(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, n); }
1233template <int n>[[gnu::always_inline]] nce poly16x4_t shift_right_insert(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, n); }
1234template <> [[gnu::always_inline]] nce int8x8_t reinterpret(poly16x4_t a) { return vreinterpret_s8_p16(a); }
1235template <> [[gnu::always_inline]] nce int16x4_t reinterpret(poly16x4_t a) { return vreinterpret_s16_p16(a); }
1236template <> [[gnu::always_inline]] nce int32x2_t reinterpret(poly16x4_t a) { return vreinterpret_s32_p16(a); }
1237template <> [[gnu::always_inline]] nce float32x2_t reinterpret(poly16x4_t a) { return vreinterpret_f32_p16(a); }
1238template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(poly16x4_t a) { return vreinterpret_u8_p16(a); }
1239template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(poly16x4_t a) { return vreinterpret_u16_p16(a); }
1240template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(poly16x4_t a) { return vreinterpret_u32_p16(a); }
1241template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(poly16x4_t a) { return vreinterpret_p8_p16(a); }
1242template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(poly16x4_t a) { return vreinterpret_u64_p16(a); }
1243template <> [[gnu::always_inline]] nce int64x1_t reinterpret(poly16x4_t a) { return vreinterpret_s64_p16(a); }
1244template <int lane>[[gnu::always_inline]] nce poly16x4_t duplicate_lane(poly16x4_t a) { return vdup_lane_p16(a, lane); }
1245template <int lane>[[gnu::always_inline]] nce poly16x8_t duplicate_lane_quad(poly16x4_t a) { return vdupq_lane_p16(a, lane); }
1246[[gnu::always_inline]] nce poly16x8_t combine(poly16x4_t low, poly16x4_t high) { return vcombine_p16(low, high); }
1247template <int lane>[[gnu::always_inline]] nce poly16_t get_lane(poly16x4_t v) { return vget_lane_p16(v, lane); }
1248template <int n>[[gnu::always_inline]] nce poly16x4_t extract(poly16x4_t a, poly16x4_t b) { return vext_p16(a, b, n); }
1249[[gnu::always_inline]] nce poly16x4_t reverse_64bit(poly16x4_t a) { return vrev64_p16(a); }
1250[[gnu::always_inline]] nce poly16x4_t reverse_32bit(poly16x4_t a) { return vrev32_p16(a); }
1251[[gnu::always_inline]] nce poly16x4x2_t zip(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); }
1252[[gnu::always_inline]] nce poly16x4x2_t unzip(poly16x4_t a, poly16x4_t b) { return vuzp_p16(a, b); }
1253[[gnu::always_inline]] nce poly16x4x2_t transpose(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); }
// poly16x4_t addition via vadd_p16. The overload is only compiled when
// __clang__ is defined.
#ifdef __clang__
[[gnu::always_inline]] nce poly16x4_t add(poly16x4_t a, poly16x4_t b) {
  return vadd_p16(a, b);
}
#endif
1257[[gnu::always_inline]] nce int64x1_t add(int64x1_t a, int64x1_t b) { return vadd_s64(a, b); }
1258[[gnu::always_inline]] nce int64x2_t add(int64x2_t a, int64x2_t b) { return vaddq_s64(a, b); }
1259[[gnu::always_inline]] nce int64x2_t add(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); }
1260[[gnu::always_inline]] nce int32x2_t add_narrow(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); }
1261[[gnu::always_inline]] nce int32x2_t add_round_narrow(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); }
1262[[gnu::always_inline]] nce int64x1_t add_saturate(int64x1_t a, int64x1_t b) { return vqadd_s64(a, b); }
1263[[gnu::always_inline]] nce int64x2_t add_saturate(int64x2_t a, int64x2_t b) { return vqaddq_s64(a, b); }
1264[[gnu::always_inline]] nce int64x2_t multiply_add_long(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); }
1265[[gnu::always_inline]] nce int64x2_t multiply_subtract_long(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); }
1266[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_s32(a, b, c); }
1267[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_s32(a, b, c); }
1268template <int lane> [[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long_lane(int64x2_t a, int32x2_t b, int32x2_t v) { return vqdmlal_lane_s32(a, b, v, lane); }
1269template <int lane> [[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long_lane(int64x2_t a, int32x2_t b, int32x2_t v) { return vqdmlsl_lane_s32(a, b, v, lane); }
1270[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlal_n_s32(a, b, c); }
1271[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlsl_n_s32(a, b, c); }
1272[[gnu::always_inline]] nce poly8x16_t multiply(poly8x16_t a, poly8x16_t b) { return vmulq_p8(a, b); }
1273[[gnu::always_inline]] nce int64x1_t subtract(int64x1_t a, int64x1_t b) { return vsub_s64(a, b); }
1274[[gnu::always_inline]] nce int64x2_t subtract(int64x2_t a, int64x2_t b) { return vsubq_s64(a, b); }
1275[[gnu::always_inline]] nce int64x2_t subtract(int64x2_t a, int32x2_t b) { return vsubw_s32(a, b); }
1276[[gnu::always_inline]] nce int32x2_t subtract_narrow(int64x2_t a, int64x2_t b) { return vsubhn_s64(a, b); }
1277[[gnu::always_inline]] nce int32x2_t subtract_round_narrow(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); }
1278[[gnu::always_inline]] nce int64x1_t subtract_saturate(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); }
1279[[gnu::always_inline]] nce int64x2_t subtract_saturate(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); }
1280[[gnu::always_inline]] nce int64x2_t subtract_absolute_add(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); }
1281[[gnu::always_inline]] nce int64x1_t pairwise_add_accumulate_long(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); }
1282[[gnu::always_inline]] nce int64x2_t pairwise_add_accumulate_long(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); }
1283[[gnu::always_inline]] nce uint8x16_t equal(poly8x16_t a, poly8x16_t b) { return vceqq_p8(a, b); }
1284[[gnu::always_inline]] nce uint8x16_t compare_test_nonzero(poly8x16_t a, poly8x16_t b) { return vtstq_p8(a, b); }
1285[[gnu::always_inline]] nce int64x1_t shift_left(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); }
1286[[gnu::always_inline]] nce int64x2_t shift_left(int64x2_t a, int64x2_t b) { return vshlq_s64(a, b); }
1287template <int n>[[gnu::always_inline]] nce int64x1_t shift_left(int64x1_t a) { return vshl_n_s64(a, n); }
1288template <int n>[[gnu::always_inline]] nce int64x2_t shift_left(int64x2_t a) { return vshlq_n_s64(a, n); }
1289[[gnu::always_inline]] nce int64x1_t shift_left_saturate(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); }
1290[[gnu::always_inline]] nce int64x2_t shift_left_saturate(int64x2_t a, int64x2_t b) { return vqshlq_s64(a, b); }
1291template <int n>[[gnu::always_inline]] nce int64x1_t shift_left_saturate(int64x1_t a) { return vqshl_n_s64(a, n); }
1292template <int n>[[gnu::always_inline]] nce int64x2_t shift_left_saturate(int64x2_t a) { return vqshlq_n_s64(a, n); }
1293template <int n>[[gnu::always_inline]] nce uint64x1_t shift_left_unsigned_saturate(int64x1_t a) { return vqshlu_n_s64(a, n); }
1294template <int n>[[gnu::always_inline]] nce uint64x2_t shift_left_unsigned_saturate(int64x2_t a) { return vqshluq_n_s64(a, n); }
1295[[gnu::always_inline]] nce int64x1_t shift_left_round(int64x1_t a, int64x1_t b) { return vrshl_s64(a, b); }
1296[[gnu::always_inline]] nce int64x2_t shift_left_round(int64x2_t a, int64x2_t b) { return vrshlq_s64(a, b); }
1297[[gnu::always_inline]] nce int64x1_t shift_left_round_saturate(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); }
1298[[gnu::always_inline]] nce int64x2_t shift_left_round_saturate(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); }
1299template <int n>[[gnu::always_inline]] nce int64x1_t shift_left_insert(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, n); }
1300template <int n>[[gnu::always_inline]] nce int64x2_t shift_left_insert(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, n); }
1301template <int n>[[gnu::always_inline]] nce poly8x16_t shift_left_insert(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, n); }
1302template <int n>[[gnu::always_inline]] nce poly16x8_t shift_left_insert(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, n); }
1303template <int n>[[gnu::always_inline]] nce int64x1_t shift_right(int64x1_t a) { return vshr_n_s64(a, n); }
1304template <int n>[[gnu::always_inline]] nce int64x2_t shift_right(int64x2_t a) { return vshrq_n_s64(a, n); }
1305template <int n>[[gnu::always_inline]] nce int64x1_t shift_right_round(int64x1_t a) { return vrshr_n_s64(a, n); }
1306template <int n>[[gnu::always_inline]] nce int64x2_t shift_right_round(int64x2_t a) { return vrshrq_n_s64(a, n); }
1307template <int n>[[gnu::always_inline]] nce int64x1_t shift_right_accumulate(int64x1_t a, int64x1_t b) { return vsra_n_s64(a, b, n); }
1308template <int n>[[gnu::always_inline]] nce int64x2_t shift_right_accumulate(int64x2_t a, int64x2_t b) { return vsraq_n_s64(a, b, n); }
1309template <int n>[[gnu::always_inline]] nce int64x1_t shift_right_accumulate_round(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, n); }
1310template <int n>[[gnu::always_inline]] nce int64x2_t shift_right_accumulate_round(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, n); }
1311template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_narrow(int64x2_t a) { return vshrn_n_s64(a, n); }
1312template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_saturate_narrow_unsigned(int64x2_t a) { return vqshrun_n_s64(a, n); }
1313template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_saturate_narrow(int64x2_t a) { return vqshrn_n_s64(a, n); }
1314template <int n>[[gnu::always_inline]] nce uint32x2_t shift_right_round_saturate_narrow_unsigned(int64x2_t a) { return vqrshrun_n_s64(a, n); }
1315template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_round_saturate_narrow(int64x2_t a) { return vqrshrn_n_s64(a, n); }
1316template <int n>[[gnu::always_inline]] nce int32x2_t shift_right_round_narrow(int64x2_t a) { return vrshrn_n_s64(a, n); }
1317template <int n>[[gnu::always_inline]] nce int64x1_t shift_right_insert(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, n); }
1318template <int n>[[gnu::always_inline]] nce int64x2_t shift_right_insert(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, n); }
1319template <int n>[[gnu::always_inline]] nce poly8x16_t shift_right_insert(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, n); }
1320template <int n>[[gnu::always_inline]] nce poly16x8_t shift_right_insert(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, n); }
1321template <> [[gnu::always_inline]] nce int8x8_t reinterpret(int64x1_t a) { return vreinterpret_s8_s64(a); }
1322template <> [[gnu::always_inline]] nce int16x4_t reinterpret(int64x1_t a) { return vreinterpret_s16_s64(a); }
1323template <> [[gnu::always_inline]] nce int32x2_t reinterpret(int64x1_t a) { return vreinterpret_s32_s64(a); }
1324template <> [[gnu::always_inline]] nce float32x2_t reinterpret(int64x1_t a) { return vreinterpret_f32_s64(a); }
1325template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(int64x1_t a) { return vreinterpret_u8_s64(a); }
1326template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(int64x1_t a) { return vreinterpret_u16_s64(a); }
1327template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(int64x1_t a) { return vreinterpret_u32_s64(a); }
1328template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(int64x1_t a) { return vreinterpret_p8_s64(a); }
1329template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(int64x1_t a) { return vreinterpret_p16_s64(a); }
1330template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(int64x1_t a) { return vreinterpret_u64_s64(a); }
1331template <> [[gnu::always_inline]] nce int8x16_t reinterpret(poly8x16_t a) { return vreinterpretq_s8_p8(a); }
1332template <> [[gnu::always_inline]] nce int16x8_t reinterpret(poly8x16_t a) { return vreinterpretq_s16_p8(a); }
1333template <> [[gnu::always_inline]] nce int32x4_t reinterpret(poly8x16_t a) { return vreinterpretq_s32_p8(a); }
1334template <> [[gnu::always_inline]] nce float32x4_t reinterpret(poly8x16_t a) { return vreinterpretq_f32_p8(a); }
1335template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(poly8x16_t a) { return vreinterpretq_u8_p8(a); }
1336template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(poly8x16_t a) { return vreinterpretq_u16_p8(a); }
1337template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(poly8x16_t a) { return vreinterpretq_u32_p8(a); }
1338template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(poly8x16_t a) { return vreinterpretq_p16_p8(a); }
1339template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(poly8x16_t a) { return vreinterpretq_u64_p8(a); }
1340template <> [[gnu::always_inline]] nce int64x2_t reinterpret(poly8x16_t a) { return vreinterpretq_s64_p8(a); }
1341template <> [[gnu::always_inline]] nce int8x16_t reinterpret(poly16x8_t a) { return vreinterpretq_s8_p16(a); }
1342template <> [[gnu::always_inline]] nce int16x8_t reinterpret(poly16x8_t a) { return vreinterpretq_s16_p16(a); }
1343template <> [[gnu::always_inline]] nce int32x4_t reinterpret(poly16x8_t a) { return vreinterpretq_s32_p16(a); }
1344template <> [[gnu::always_inline]] nce float32x4_t reinterpret(poly16x8_t a) { return vreinterpretq_f32_p16(a); }
1345template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(poly16x8_t a) { return vreinterpretq_u8_p16(a); }
1346template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(poly16x8_t a) { return vreinterpretq_u16_p16(a); }
1347template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(poly16x8_t a) { return vreinterpretq_u32_p16(a); }
1348template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(poly16x8_t a) { return vreinterpretq_p8_p16(a); }
1349template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(poly16x8_t a) { return vreinterpretq_u64_p16(a); }
1350template <> [[gnu::always_inline]] nce int64x2_t reinterpret(poly16x8_t a) { return vreinterpretq_s64_p16(a); }
1351template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(int64x2_t a) { return vreinterpretq_p8_s64(a); }
1352template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(int64x2_t a) { return vreinterpretq_p16_s64(a); }
1353[[gnu::always_inline]] nce int32x2_t move_narrow(int64x2_t a) { return vmovn_s64(a); }
1354[[gnu::always_inline]] nce int32x2_t move_saturate_narrow(int64x2_t a) { return vqmovn_s64(a); }
1355[[gnu::always_inline]] nce uint32x2_t move_unsigned_saturate_narrow(int64x2_t a) { return vqmovun_s64(a); }
1356template <int lane> [[gnu::always_inline]] nce int64x2_t multiply_add_long_lane(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlal_lane_s32(a, b, v, lane); }
1357template <int lane> [[gnu::always_inline]] nce int64x2_t multiply_subtract_long_lane(int64x2_t a, int32x2_t b, int32x2_t v) { return vmlsl_lane_s32(a, b, v, lane); }
1358[[gnu::always_inline]] nce int64x2_t multiply_add_long(int64x2_t a, int32x2_t b, int32_t c) { return vmlal_n_s32(a, b, c); }
1359[[gnu::always_inline]] nce int64x2_t multiply_subtract_long(int64x2_t a, int32x2_t b, int32_t c) { return vmlsl_n_s32(a, b, c); }
1360[[gnu::always_inline]] nce poly8x16_t bitwise_not(poly8x16_t a) { return vmvnq_p8(a); }
1361[[gnu::always_inline]] nce int64x1_t bitwise_and(int64x1_t a, int64x1_t b) { return vand_s64(a, b); }
1362[[gnu::always_inline]] nce int64x2_t bitwise_and(int64x2_t a, int64x2_t b) { return vandq_s64(a, b); }
1363[[gnu::always_inline]] nce int64x1_t bitwise_or(int64x1_t a, int64x1_t b) { return vorr_s64(a, b); }
1364[[gnu::always_inline]] nce int64x2_t bitwise_or(int64x2_t a, int64x2_t b) { return vorrq_s64(a, b); }
1365[[gnu::always_inline]] nce int64x1_t bitwise_xor(int64x1_t a, int64x1_t b) { return veor_s64(a, b); }
1366[[gnu::always_inline]] nce int64x2_t bitwise_xor(int64x2_t a, int64x2_t b) { return veorq_s64(a, b); }
1367[[gnu::always_inline]] nce int64x1_t bitwise_or_not(int64x1_t a, int64x1_t b) { return vorn_s64(a, b); }
1368[[gnu::always_inline]] nce int64x2_t bitwise_or_not(int64x2_t a, int64x2_t b) { return vornq_s64(a, b); }
1369[[gnu::always_inline]] nce poly8x16_t count_active_bits(poly8x16_t a) { return vcntq_p8(a); }
1370[[gnu::always_inline]] nce int64x1_t bitwise_clear(int64x1_t a, int64x1_t b) { return vbic_s64(a, b); }
1371[[gnu::always_inline]] nce int64x2_t bitwise_clear(int64x2_t a, int64x2_t b) { return vbicq_s64(a, b); }
1372template <> [[gnu::always_inline]] nce int8x8_t create(uint64_t a) { return vcreate_s8(a); }
1373template <> [[gnu::always_inline]] nce int16x4_t create(uint64_t a) { return vcreate_s16(a); }
1374template <> [[gnu::always_inline]] nce int32x2_t create(uint64_t a) { return vcreate_s32(a); }
1375template <> [[gnu::always_inline]] nce int64x1_t create(uint64_t a) { return vcreate_s64(a); }
1376template <> [[gnu::always_inline]] nce uint8x8_t create(uint64_t a) { return vcreate_u8(a); }
1377template <> [[gnu::always_inline]] nce uint16x4_t create(uint64_t a) { return vcreate_u16(a); }
1378template <> [[gnu::always_inline]] nce uint32x2_t create(uint64_t a) { return vcreate_u32(a); }
1379template <> [[gnu::always_inline]] nce uint64x1_t create(uint64_t a) { return vcreate_u64(a); }
1380template <> [[gnu::always_inline]] nce float32x2_t create(uint64_t a) { return vcreate_f32(a); }
1381template <> [[gnu::always_inline]] nce poly8x8_t create(uint64_t a) { return vcreate_p8(a); }
1382template <> [[gnu::always_inline]] nce poly16x4_t create(uint64_t a) { return vcreate_p16(a); }
1383template <> [[gnu::always_inline]] nce int8x8_t duplicate(int8_t value) { return vdup_n_s8(value); }
1384template <> [[gnu::always_inline]] nce int8x16_t duplicate(int8_t value) { return vdupq_n_s8(value); }
1385template <> [[gnu::always_inline]] nce int16x4_t duplicate(int16_t value) { return vdup_n_s16(value); }
1386template <> [[gnu::always_inline]] nce int16x8_t duplicate(int16_t value) { return vdupq_n_s16(value); }
1387template <> [[gnu::always_inline]] nce int32x2_t duplicate(int32_t value) { return vdup_n_s32(value); }
1388template <> [[gnu::always_inline]] nce int32x4_t duplicate(int32_t value) { return vdupq_n_s32(value); }
1389template <> [[gnu::always_inline]] nce int64x1_t duplicate(int64_t value) { return vdup_n_s64(value); }
1390template <> [[gnu::always_inline]] nce int64x2_t duplicate(int64_t value) { return vdupq_n_s64(value); }
1391template <> [[gnu::always_inline]] nce uint8x8_t duplicate(uint8_t value) { return vdup_n_u8(value); }
1392template <> [[gnu::always_inline]] nce uint8x16_t duplicate(uint8_t value) { return vdupq_n_u8(value); }
1393template <> [[gnu::always_inline]] nce uint16x4_t duplicate(uint16_t value) { return vdup_n_u16(value); }
1394template <> [[gnu::always_inline]] nce uint16x8_t duplicate(uint16_t value) { return vdupq_n_u16(value); }
1395template <> [[gnu::always_inline]] nce uint32x2_t duplicate(uint32_t value) { return vdup_n_u32(value); }
1396template <> [[gnu::always_inline]] nce uint32x4_t duplicate(uint32_t value) { return vdupq_n_u32(value); }
1397template <> [[gnu::always_inline]] nce uint64x1_t duplicate(uint64_t value) { return vdup_n_u64(value); }
1398template <> [[gnu::always_inline]] nce uint64x2_t duplicate(uint64_t value) { return vdupq_n_u64(value); }
1399template <> [[gnu::always_inline]] nce float32x2_t duplicate(float32_t value) { return vdup_n_f32(value); }
1400template <> [[gnu::always_inline]] nce float32x4_t duplicate(float32_t value) { return vdupq_n_f32(value); }
1401template <> [[gnu::always_inline]] nce poly8x8_t duplicate(poly8_t value) { return vdup_n_p8(value); }
1402template <> [[gnu::always_inline]] nce poly8x16_t duplicate(poly8_t value) { return vdupq_n_p8(value); }
1403template <> [[gnu::always_inline]] nce poly16x4_t duplicate(poly16_t value) { return vdup_n_p16(value); }
1404template <> [[gnu::always_inline]] nce poly16x8_t duplicate(poly16_t value) { return vdupq_n_p16(value); }
1405template <int lane>[[gnu::always_inline]] nce int64x1_t duplicate_lane(int64x1_t a) { return vdup_lane_s64(a, lane); }
1406template <int lane>[[gnu::always_inline]] nce int64x2_t duplicate_lane_quad(int64x1_t a) { return vdupq_lane_s64(a, lane); }
1407[[gnu::always_inline]] nce int64x2_t combine(int64x1_t low, int64x1_t high) { return vcombine_s64(low, high); }
1408[[gnu::always_inline]] nce int64x1_t get_high(int64x2_t a) { return vget_high_s64(a); }
1409[[gnu::always_inline]] nce poly8x8_t get_high(poly8x16_t a) { return vget_high_p8(a); }
1410[[gnu::always_inline]] nce poly16x4_t get_high(poly16x8_t a) { return vget_high_p16(a); }
1411[[gnu::always_inline]] nce int64x1_t get_low(int64x2_t a) { return vget_low_s64(a); }
1412[[gnu::always_inline]] nce poly8x8_t get_low(poly8x16_t a) { return vget_low_p8(a); }
1413[[gnu::always_inline]] nce poly16x4_t get_low(poly16x8_t a) { return vget_low_p16(a); }
1414template <int lane>[[gnu::always_inline]] nce int64_t get_lane(int64x1_t v) { return vget_lane_s64(v, lane); }
1415template <int lane>[[gnu::always_inline]] nce poly8_t get_lane(poly8x16_t v) { return vgetq_lane_p8(v, lane); }
1416template <int lane>[[gnu::always_inline]] nce poly16_t get_lane(poly16x8_t v) { return vgetq_lane_p16(v, lane); }
1417template <int n>[[gnu::always_inline]] nce int64x1_t extract(int64x1_t a, int64x1_t b) { return vext_s64(a, b, n); }
1418template <int n>[[gnu::always_inline]] nce int64x2_t extract(int64x2_t a, int64x2_t b) { return vextq_s64(a, b, n); }
1419template <int n>[[gnu::always_inline]] nce poly8x16_t extract(poly8x16_t a, poly8x16_t b) { return vextq_p8(a, b, n); }
1420template <int n>[[gnu::always_inline]] nce poly16x8_t extract(poly16x8_t a, poly16x8_t b) { return vextq_p16(a, b, n); }
1421[[gnu::always_inline]] nce poly8x16_t reverse_64bit(poly8x16_t a) { return vrev64q_p8(a); }
1422[[gnu::always_inline]] nce poly16x8_t reverse_64bit(poly16x8_t a) { return vrev64q_p16(a); }
1423[[gnu::always_inline]] nce poly8x16_t reverse_32bit(poly8x16_t a) { return vrev32q_p8(a); }
1424[[gnu::always_inline]] nce poly16x8_t reverse_32bit(poly16x8_t a) { return vrev32q_p16(a); }
1425[[gnu::always_inline]] nce poly8x16_t reverse_16bit(poly8x16_t a) { return vrev16q_p8(a); }
1426[[gnu::always_inline]] nce poly8x16x2_t zip(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); }
1427[[gnu::always_inline]] nce poly16x8x2_t zip(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); }
1428[[gnu::always_inline]] nce poly8x16x2_t unzip(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); }
1429[[gnu::always_inline]] nce poly16x8x2_t unzip(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); }
1430[[gnu::always_inline]] nce poly8x16x2_t transpose(poly8x16_t a, poly8x16_t b) { return vtrnq_p8(a, b); }
1431[[gnu::always_inline]] nce poly16x8x2_t transpose(poly16x8_t a, poly16x8_t b) { return vtrnq_p16(a, b); }
1432template <int lane>[[gnu::always_inline]] nce uint8x8_t set_lane(uint8_t a, uint8x8_t v) { return vset_lane_u8(a, v, lane); }
1433template <int lane>[[gnu::always_inline]] nce uint16x4_t set_lane(uint16_t a, uint16x4_t v) { return vset_lane_u16(a, v, lane); }
1434template <int lane>[[gnu::always_inline]] nce uint32x2_t set_lane(uint32_t a, uint32x2_t v) { return vset_lane_u32(a, v, lane); }
1435template <int lane>[[gnu::always_inline]] nce uint64x1_t set_lane(uint64_t a, uint64x1_t v) { return vset_lane_u64(a, v, lane); }
1436template <int lane>[[gnu::always_inline]] nce int8x8_t set_lane(int8_t a, int8x8_t v) { return vset_lane_s8(a, v, lane); }
1437template <int lane>[[gnu::always_inline]] nce int16x4_t set_lane(int16_t a, int16x4_t v) { return vset_lane_s16(a, v, lane); }
1438template <int lane>[[gnu::always_inline]] nce int32x2_t set_lane(int32_t a, int32x2_t v) { return vset_lane_s32(a, v, lane); }
1439template <int lane>[[gnu::always_inline]] nce int64x1_t set_lane(int64_t a, int64x1_t v) { return vset_lane_s64(a, v, lane); }
1440template <int lane>[[gnu::always_inline]] nce poly8x8_t set_lane(poly8_t a, poly8x8_t v) { return vset_lane_p8(a, v, lane); }
1441template <int lane>[[gnu::always_inline]] nce poly16x4_t set_lane(poly16_t a, poly16x4_t v) { return vset_lane_p16(a, v, lane); }
1442template <int lane>[[gnu::always_inline]] nce float32x2_t set_lane(float32_t a, float32x2_t v) { return vset_lane_f32(a, v, lane); }
1443template <int lane>[[gnu::always_inline]] nce poly8x16_t set_lane(poly8_t a, poly8x16_t v) { return vsetq_lane_p8(a, v, lane); }
1444template <int lane>[[gnu::always_inline]] nce poly16x8_t set_lane(poly16_t a, poly16x8_t v) { return vsetq_lane_p16(a, v, lane); }
1445template <> [[gnu::always_inline]] inline int8x8_t load1(int8_t const *ptr) { return vld1_s8(ptr); }
1446template <> [[gnu::always_inline]] inline int8x16_t load1(int8_t const *ptr) { return vld1q_s8(ptr); }
1447template <> [[gnu::always_inline]] inline int16x4_t load1(int16_t const *ptr) { return vld1_s16(ptr); }
1448template <> [[gnu::always_inline]] inline int16x8_t load1(int16_t const *ptr) { return vld1q_s16(ptr); }
1449template <> [[gnu::always_inline]] inline int32x2_t load1(int32_t const *ptr) { return vld1_s32(ptr); }
1450template <> [[gnu::always_inline]] inline int32x4_t load1(int32_t const *ptr) { return vld1q_s32(ptr); }
1451template <> [[gnu::always_inline]] inline int64x1_t load1(int64_t const *ptr) { return vld1_s64(ptr); }
1452template <> [[gnu::always_inline]] inline int64x2_t load1(int64_t const *ptr) { return vld1q_s64(ptr); }
1453template <> [[gnu::always_inline]] inline uint8x8_t load1(uint8_t const *ptr) { return vld1_u8(ptr); }
1454template <> [[gnu::always_inline]] inline uint8x16_t load1(uint8_t const *ptr) { return vld1q_u8(ptr); }
1455template <> [[gnu::always_inline]] inline uint16x4_t load1(uint16_t const *ptr) { return vld1_u16(ptr); }
1456template <> [[gnu::always_inline]] inline uint16x8_t load1(uint16_t const *ptr) { return vld1q_u16(ptr); }
1457template <> [[gnu::always_inline]] inline uint32x2_t load1(uint32_t const *ptr) { return vld1_u32(ptr); }
1458template <> [[gnu::always_inline]] inline uint32x4_t load1(uint32_t const *ptr) { return vld1q_u32(ptr); }
1459template <> [[gnu::always_inline]] inline uint64x1_t load1(uint64_t const *ptr) { return vld1_u64(ptr); }
1460template <> [[gnu::always_inline]] inline uint64x2_t load1(uint64_t const *ptr) { return vld1q_u64(ptr); }
1461template <> [[gnu::always_inline]] inline float32x2_t load1(float32_t const *ptr) { return vld1_f32(ptr); }
1462template <> [[gnu::always_inline]] inline float32x4_t load1(float32_t const *ptr) { return vld1q_f32(ptr); }
1463template <> [[gnu::always_inline]] inline poly8x8_t load1(poly8_t const *ptr) { return vld1_p8(ptr); }
1464template <> [[gnu::always_inline]] inline poly8x16_t load1(poly8_t const *ptr) { return vld1q_p8(ptr); }
1465template <> [[gnu::always_inline]] inline poly16x4_t load1(poly16_t const *ptr) { return vld1_p16(ptr); }
1466template <> [[gnu::always_inline]] inline poly16x8_t load1(poly16_t const *ptr) { return vld1q_p16(ptr); }
1467template <int lane>[[gnu::always_inline]] nce int8x8_t load1_lane(int8_t const *ptr, int8x8_t src) { return vld1_lane_s8(ptr, src, lane); }
1468template <int lane>[[gnu::always_inline]] nce int8x16_t load1_lane_quad(int8_t const *ptr, int8x16_t src) { return vld1q_lane_s8(ptr, src, lane); }
1469template <int lane>[[gnu::always_inline]] nce int16x4_t load1_lane(int16_t const *ptr, int16x4_t src) { return vld1_lane_s16(ptr, src, lane); }
1470template <int lane>[[gnu::always_inline]] nce int16x8_t load1_lane_quad(int16_t const *ptr, int16x8_t src) { return vld1q_lane_s16(ptr, src, lane); }
1471template <int lane>[[gnu::always_inline]] nce int32x2_t load1_lane(int32_t const *ptr, int32x2_t src) { return vld1_lane_s32(ptr, src, lane); }
1472template <int lane>[[gnu::always_inline]] nce int32x4_t load1_lane_quad(int32_t const *ptr, int32x4_t src) { return vld1q_lane_s32(ptr, src, lane); }
1473template <int lane>[[gnu::always_inline]] nce int64x1_t load1_lane(int64_t const *ptr, int64x1_t src) { return vld1_lane_s64(ptr, src, lane); }
1474template <int lane>[[gnu::always_inline]] nce int64x2_t load1_lane_quad(int64_t const *ptr, int64x2_t src) { return vld1q_lane_s64(ptr, src, lane); }
1475template <int lane>[[gnu::always_inline]] nce uint8x8_t load1_lane(uint8_t const *ptr, uint8x8_t src) { return vld1_lane_u8(ptr, src, lane); }
1476template <int lane>[[gnu::always_inline]] nce uint8x16_t load1_lane_quad(uint8_t const *ptr, uint8x16_t src) { return vld1q_lane_u8(ptr, src, lane); }
1477template <int lane>[[gnu::always_inline]] nce uint16x4_t load1_lane(uint16_t const *ptr, uint16x4_t src) { return vld1_lane_u16(ptr, src, lane); }
1478template <int lane>[[gnu::always_inline]] nce uint16x8_t load1_lane_quad(uint16_t const *ptr, uint16x8_t src) { return vld1q_lane_u16(ptr, src, lane); }
1479template <int lane>[[gnu::always_inline]] nce uint32x2_t load1_lane(uint32_t const *ptr, uint32x2_t src) { return vld1_lane_u32(ptr, src, lane); }
1480template <int lane>[[gnu::always_inline]] nce uint32x4_t load1_lane_quad(uint32_t const *ptr, uint32x4_t src) { return vld1q_lane_u32(ptr, src, lane); }
1481template <int lane>[[gnu::always_inline]] nce uint64x1_t load1_lane(uint64_t const *ptr, uint64x1_t src) { return vld1_lane_u64(ptr, src, lane); }
1482template <int lane>[[gnu::always_inline]] nce uint64x2_t load1_lane_quad(uint64_t const *ptr, uint64x2_t src) { return vld1q_lane_u64(ptr, src, lane); }
1483template <int lane>[[gnu::always_inline]] nce float32x2_t load1_lane(float32_t const *ptr, float32x2_t src) { return vld1_lane_f32(ptr, src, lane); }
1484template <int lane>[[gnu::always_inline]] nce float32x4_t load1_lane_quad(float32_t const *ptr, float32x4_t src) { return vld1q_lane_f32(ptr, src, lane); }
1485template <int lane>[[gnu::always_inline]] nce poly8x8_t load1_lane(poly8_t const *ptr, poly8x8_t src) { return vld1_lane_p8(ptr, src, lane); }
1486template <int lane>[[gnu::always_inline]] nce poly8x16_t load1_lane_quad(poly8_t const *ptr, poly8x16_t src) { return vld1q_lane_p8(ptr, src, lane); }
1487template <int lane>[[gnu::always_inline]] nce poly16x4_t load1_lane(poly16_t const *ptr, poly16x4_t src) { return vld1_lane_p16(ptr, src, lane); }
1488template <int lane>[[gnu::always_inline]] nce poly16x8_t load1_lane_quad(poly16_t const *ptr, poly16x8_t src) { return vld1q_lane_p16(ptr, src, lane); }
1489template <> [[gnu::always_inline]] inline int8x8_t load1_duplicate(int8_t const *ptr) { return vld1_dup_s8(ptr); }
1490template <> [[gnu::always_inline]] inline int8x16_t load1_duplicate(int8_t const *ptr) { return vld1q_dup_s8(ptr); }
1491template <> [[gnu::always_inline]] inline int16x4_t load1_duplicate(int16_t const *ptr) { return vld1_dup_s16(ptr); }
1492template <> [[gnu::always_inline]] inline int16x8_t load1_duplicate(int16_t const *ptr) { return vld1q_dup_s16(ptr); }
1493template <> [[gnu::always_inline]] inline int32x2_t load1_duplicate(int32_t const *ptr) { return vld1_dup_s32(ptr); }
1494template <> [[gnu::always_inline]] inline int32x4_t load1_duplicate(int32_t const *ptr) { return vld1q_dup_s32(ptr); }
1495template <> [[gnu::always_inline]] inline int64x1_t load1_duplicate(int64_t const *ptr) { return vld1_dup_s64(ptr); }
1496template <> [[gnu::always_inline]] inline int64x2_t load1_duplicate(int64_t const *ptr) { return vld1q_dup_s64(ptr); }
1497template <> [[gnu::always_inline]] inline uint8x8_t load1_duplicate(uint8_t const *ptr) { return vld1_dup_u8(ptr); }
1498template <> [[gnu::always_inline]] inline uint8x16_t load1_duplicate(uint8_t const *ptr) { return vld1q_dup_u8(ptr); }
1499template <> [[gnu::always_inline]] inline uint16x4_t load1_duplicate(uint16_t const *ptr) { return vld1_dup_u16(ptr); }
1500template <> [[gnu::always_inline]] inline uint16x8_t load1_duplicate(uint16_t const *ptr) { return vld1q_dup_u16(ptr); }
1501template <> [[gnu::always_inline]] inline uint32x2_t load1_duplicate(uint32_t const *ptr) { return vld1_dup_u32(ptr); }
1502template <> [[gnu::always_inline]] inline uint32x4_t load1_duplicate(uint32_t const *ptr) { return vld1q_dup_u32(ptr); }
1503template <> [[gnu::always_inline]] inline uint64x1_t load1_duplicate(uint64_t const *ptr) { return vld1_dup_u64(ptr); }
1504template <> [[gnu::always_inline]] inline uint64x2_t load1_duplicate(uint64_t const *ptr) { return vld1q_dup_u64(ptr); }
1505template <> [[gnu::always_inline]] inline float32x2_t load1_duplicate(float32_t const *ptr) { return vld1_dup_f32(ptr); }
1506template <> [[gnu::always_inline]] inline float32x4_t load1_duplicate(float32_t const *ptr) { return vld1q_dup_f32(ptr); }
1507template <> [[gnu::always_inline]] inline poly8x8_t load1_duplicate(poly8_t const *ptr) { return vld1_dup_p8(ptr); }
1508template <> [[gnu::always_inline]] inline poly8x16_t load1_duplicate(poly8_t const *ptr) { return vld1q_dup_p8(ptr); }
1509template <> [[gnu::always_inline]] inline poly16x4_t load1_duplicate(poly16_t const *ptr) { return vld1_dup_p16(ptr); }
1510template <> [[gnu::always_inline]] inline poly16x8_t load1_duplicate(poly16_t const *ptr) { return vld1q_dup_p16(ptr); }
1511template <> [[gnu::always_inline]] inline int8x8x2_t load2(int8_t const *ptr) { return vld2_s8(ptr); }
1512template <> [[gnu::always_inline]] inline int8x16x2_t load2(int8_t const *ptr) { return vld2q_s8(ptr); }
1513template <> [[gnu::always_inline]] inline int16x4x2_t load2(int16_t const *ptr) { return vld2_s16(ptr); }
1514template <> [[gnu::always_inline]] inline int16x8x2_t load2(int16_t const *ptr) { return vld2q_s16(ptr); }
1515template <> [[gnu::always_inline]] inline int32x2x2_t load2(int32_t const *ptr) { return vld2_s32(ptr); }
1516template <> [[gnu::always_inline]] inline int32x4x2_t load2(int32_t const *ptr) { return vld2q_s32(ptr); }
1517template <> [[gnu::always_inline]] inline uint8x8x2_t load2(uint8_t const *ptr) { return vld2_u8(ptr); }
1518template <> [[gnu::always_inline]] inline uint8x16x2_t load2(uint8_t const *ptr) { return vld2q_u8(ptr); }
1519template <> [[gnu::always_inline]] inline uint16x4x2_t load2(uint16_t const *ptr) { return vld2_u16(ptr); }
1520template <> [[gnu::always_inline]] inline uint16x8x2_t load2(uint16_t const *ptr) { return vld2q_u16(ptr); }
1521template <> [[gnu::always_inline]] inline uint32x2x2_t load2(uint32_t const *ptr) { return vld2_u32(ptr); }
1522template <> [[gnu::always_inline]] inline uint32x4x2_t load2(uint32_t const *ptr) { return vld2q_u32(ptr); }
1523template <> [[gnu::always_inline]] inline float32x2x2_t load2(float32_t const *ptr) { return vld2_f32(ptr); }
1524template <> [[gnu::always_inline]] inline float32x4x2_t load2(float32_t const *ptr) { return vld2q_f32(ptr); }
1525template <> [[gnu::always_inline]] inline poly8x8x2_t load2(poly8_t const *ptr) { return vld2_p8(ptr); }
1526template <> [[gnu::always_inline]] inline poly8x16x2_t load2(poly8_t const *ptr) { return vld2q_p8(ptr); }
1527template <> [[gnu::always_inline]] inline poly16x4x2_t load2(poly16_t const *ptr) { return vld2_p16(ptr); }
1528template <> [[gnu::always_inline]] inline poly16x8x2_t load2(poly16_t const *ptr) { return vld2q_p16(ptr); }
1529template <> [[gnu::always_inline]] inline int64x1x2_t load2(int64_t const *ptr) { return vld2_s64(ptr); }
1530template <> [[gnu::always_inline]] inline uint64x1x2_t load2(uint64_t const *ptr) { return vld2_u64(ptr); }
1531template <> [[gnu::always_inline]] inline int8x8x3_t load3(int8_t const *ptr) { return vld3_s8(ptr); }
1532template <> [[gnu::always_inline]] inline int8x16x3_t load3(int8_t const *ptr) { return vld3q_s8(ptr); }
1533template <> [[gnu::always_inline]] inline int16x4x3_t load3(int16_t const *ptr) { return vld3_s16(ptr); }
1534template <> [[gnu::always_inline]] inline int16x8x3_t load3(int16_t const *ptr) { return vld3q_s16(ptr); }
1535template <> [[gnu::always_inline]] inline int32x2x3_t load3(int32_t const *ptr) { return vld3_s32(ptr); }
1536template <> [[gnu::always_inline]] inline int32x4x3_t load3(int32_t const *ptr) { return vld3q_s32(ptr); }
1537template <> [[gnu::always_inline]] inline uint8x8x3_t load3(uint8_t const *ptr) { return vld3_u8(ptr); }
1538template <> [[gnu::always_inline]] inline uint8x16x3_t load3(uint8_t const *ptr) { return vld3q_u8(ptr); }
1539template <> [[gnu::always_inline]] inline uint16x4x3_t load3(uint16_t const *ptr) { return vld3_u16(ptr); }
1540template <> [[gnu::always_inline]] inline uint16x8x3_t load3(uint16_t const *ptr) { return vld3q_u16(ptr); }
1541template <> [[gnu::always_inline]] inline uint32x2x3_t load3(uint32_t const *ptr) { return vld3_u32(ptr); }
1542template <> [[gnu::always_inline]] inline uint32x4x3_t load3(uint32_t const *ptr) { return vld3q_u32(ptr); }
1543template <> [[gnu::always_inline]] inline float32x2x3_t load3(float32_t const *ptr) { return vld3_f32(ptr); }
1544template <> [[gnu::always_inline]] inline float32x4x3_t load3(float32_t const *ptr) { return vld3q_f32(ptr); }
1545template <> [[gnu::always_inline]] inline poly8x8x3_t load3(poly8_t const *ptr) { return vld3_p8(ptr); }
1546template <> [[gnu::always_inline]] inline poly8x16x3_t load3(poly8_t const *ptr) { return vld3q_p8(ptr); }
1547template <> [[gnu::always_inline]] inline poly16x4x3_t load3(poly16_t const *ptr) { return vld3_p16(ptr); }
1548template <> [[gnu::always_inline]] inline poly16x8x3_t load3(poly16_t const *ptr) { return vld3q_p16(ptr); }
1549template <> [[gnu::always_inline]] inline int64x1x3_t load3(int64_t const *ptr) { return vld3_s64(ptr); }
1550template <> [[gnu::always_inline]] inline uint64x1x3_t load3(uint64_t const *ptr) { return vld3_u64(ptr); }
1551template <> [[gnu::always_inline]] inline int8x8x4_t load4(int8_t const *ptr) { return vld4_s8(ptr); }
1552template <> [[gnu::always_inline]] inline int8x16x4_t load4(int8_t const *ptr) { return vld4q_s8(ptr); }
1553template <> [[gnu::always_inline]] inline int16x4x4_t load4(int16_t const *ptr) { return vld4_s16(ptr); }
1554template <> [[gnu::always_inline]] inline int16x8x4_t load4(int16_t const *ptr) { return vld4q_s16(ptr); }
1555template <> [[gnu::always_inline]] inline int32x2x4_t load4(int32_t const *ptr) { return vld4_s32(ptr); }
1556template <> [[gnu::always_inline]] inline int32x4x4_t load4(int32_t const *ptr) { return vld4q_s32(ptr); }
1557template <> [[gnu::always_inline]] inline uint8x8x4_t load4(uint8_t const *ptr) { return vld4_u8(ptr); }
1558template <> [[gnu::always_inline]] inline uint8x16x4_t load4(uint8_t const *ptr) { return vld4q_u8(ptr); }
1559template <> [[gnu::always_inline]] inline uint16x4x4_t load4(uint16_t const *ptr) { return vld4_u16(ptr); }
1560template <> [[gnu::always_inline]] inline uint16x8x4_t load4(uint16_t const *ptr) { return vld4q_u16(ptr); }
1561template <> [[gnu::always_inline]] inline uint32x2x4_t load4(uint32_t const *ptr) { return vld4_u32(ptr); }
1562template <> [[gnu::always_inline]] inline uint32x4x4_t load4(uint32_t const *ptr) { return vld4q_u32(ptr); }
1563template <> [[gnu::always_inline]] inline float32x2x4_t load4(float32_t const *ptr) { return vld4_f32(ptr); }
1564template <> [[gnu::always_inline]] inline float32x4x4_t load4(float32_t const *ptr) { return vld4q_f32(ptr); }
1565template <> [[gnu::always_inline]] inline poly8x8x4_t load4(poly8_t const *ptr) { return vld4_p8(ptr); }
1566template <> [[gnu::always_inline]] inline poly8x16x4_t load4(poly8_t const *ptr) { return vld4q_p8(ptr); }
1567template <> [[gnu::always_inline]] inline poly16x4x4_t load4(poly16_t const *ptr) { return vld4_p16(ptr); }
1568template <> [[gnu::always_inline]] inline poly16x8x4_t load4(poly16_t const *ptr) { return vld4q_p16(ptr); }
1569template <> [[gnu::always_inline]] inline int64x1x4_t load4(int64_t const *ptr) { return vld4_s64(ptr); }
1570template <> [[gnu::always_inline]] inline uint64x1x4_t load4(uint64_t const *ptr) { return vld4_u64(ptr); }
1571template <> [[gnu::always_inline]] inline int8x8x2_t load2_duplicate(int8_t const *ptr) { return vld2_dup_s8(ptr); }
1572template <> [[gnu::always_inline]] inline int16x4x2_t load2_duplicate(int16_t const *ptr) { return vld2_dup_s16(ptr); }
1573template <> [[gnu::always_inline]] inline int32x2x2_t load2_duplicate(int32_t const *ptr) { return vld2_dup_s32(ptr); }
1574template <> [[gnu::always_inline]] inline uint8x8x2_t load2_duplicate(uint8_t const *ptr) { return vld2_dup_u8(ptr); }
1575template <> [[gnu::always_inline]] inline uint16x4x2_t load2_duplicate(uint16_t const *ptr) { return vld2_dup_u16(ptr); }
1576template <> [[gnu::always_inline]] inline uint32x2x2_t load2_duplicate(uint32_t const *ptr) { return vld2_dup_u32(ptr); }
1577template <> [[gnu::always_inline]] inline float32x2x2_t load2_duplicate(float32_t const *ptr) { return vld2_dup_f32(ptr); }
1578template <> [[gnu::always_inline]] inline poly8x8x2_t load2_duplicate(poly8_t const *ptr) { return vld2_dup_p8(ptr); }
1579template <> [[gnu::always_inline]] inline poly16x4x2_t load2_duplicate(poly16_t const *ptr) { return vld2_dup_p16(ptr); }
1580template <> [[gnu::always_inline]] inline int64x1x2_t load2_duplicate(int64_t const *ptr) { return vld2_dup_s64(ptr); }
1581template <> [[gnu::always_inline]] inline uint64x1x2_t load2_duplicate(uint64_t const *ptr) { return vld2_dup_u64(ptr); }
1582template <> [[gnu::always_inline]] inline int8x8x3_t load3_duplicate(int8_t const *ptr) { return vld3_dup_s8(ptr); }
1583template <> [[gnu::always_inline]] inline int16x4x3_t load3_duplicate(int16_t const *ptr) { return vld3_dup_s16(ptr); }
1584template <> [[gnu::always_inline]] inline int32x2x3_t load3_duplicate(int32_t const *ptr) { return vld3_dup_s32(ptr); }
1585template <> [[gnu::always_inline]] inline uint8x8x3_t load3_duplicate(uint8_t const *ptr) { return vld3_dup_u8(ptr); }
1586template <> [[gnu::always_inline]] inline uint16x4x3_t load3_duplicate(uint16_t const *ptr) { return vld3_dup_u16(ptr); }
1587template <> [[gnu::always_inline]] inline uint32x2x3_t load3_duplicate(uint32_t const *ptr) { return vld3_dup_u32(ptr); }
1588template <> [[gnu::always_inline]] inline float32x2x3_t load3_duplicate(float32_t const *ptr) { return vld3_dup_f32(ptr); }
1589template <> [[gnu::always_inline]] inline poly8x8x3_t load3_duplicate(poly8_t const *ptr) { return vld3_dup_p8(ptr); }
1590template <> [[gnu::always_inline]] inline poly16x4x3_t load3_duplicate(poly16_t const *ptr) { return vld3_dup_p16(ptr); }
1591template <> [[gnu::always_inline]] inline int64x1x3_t load3_duplicate(int64_t const *ptr) { return vld3_dup_s64(ptr); }
1592template <> [[gnu::always_inline]] inline uint64x1x3_t load3_duplicate(uint64_t const *ptr) { return vld3_dup_u64(ptr); }
1593template <> [[gnu::always_inline]] inline int8x8x4_t load4_duplicate(int8_t const *ptr) { return vld4_dup_s8(ptr); }
1594template <> [[gnu::always_inline]] inline int16x4x4_t load4_duplicate(int16_t const *ptr) { return vld4_dup_s16(ptr); }
1595template <> [[gnu::always_inline]] inline int32x2x4_t load4_duplicate(int32_t const *ptr) { return vld4_dup_s32(ptr); }
1596template <> [[gnu::always_inline]] inline uint8x8x4_t load4_duplicate(uint8_t const *ptr) { return vld4_dup_u8(ptr); }
1597template <> [[gnu::always_inline]] inline uint16x4x4_t load4_duplicate(uint16_t const *ptr) { return vld4_dup_u16(ptr); }
1598template <> [[gnu::always_inline]] inline uint32x2x4_t load4_duplicate(uint32_t const *ptr) { return vld4_dup_u32(ptr); }
1599template <> [[gnu::always_inline]] inline float32x2x4_t load4_duplicate(float32_t const *ptr) { return vld4_dup_f32(ptr); }
1600template <> [[gnu::always_inline]] inline poly8x8x4_t load4_duplicate(poly8_t const *ptr) { return vld4_dup_p8(ptr); }
1601template <> [[gnu::always_inline]] inline poly16x4x4_t load4_duplicate(poly16_t const *ptr) { return vld4_dup_p16(ptr); }
1602#ifdef __clang__
1603template <> [[gnu::always_inline]] inline int8x16x2_t load2_duplicate(int8_t const *ptr) { return vld2q_dup_s8(ptr); }
1604template <> [[gnu::always_inline]] inline int16x8x2_t load2_duplicate(int16_t const *ptr) { return vld2q_dup_s16(ptr); }
1605template <> [[gnu::always_inline]] inline int32x4x2_t load2_duplicate(int32_t const *ptr) { return vld2q_dup_s32(ptr); }
1606template <> [[gnu::always_inline]] inline uint8x16x2_t load2_duplicate(uint8_t const *ptr) { return vld2q_dup_u8(ptr); }
1607template <> [[gnu::always_inline]] inline uint16x8x2_t load2_duplicate(uint16_t const *ptr) { return vld2q_dup_u16(ptr); }
1608template <> [[gnu::always_inline]] inline uint32x4x2_t load2_duplicate(uint32_t const *ptr) { return vld2q_dup_u32(ptr); }
1609template <> [[gnu::always_inline]] inline float32x4x2_t load2_duplicate(float32_t const *ptr) { return vld2q_dup_f32(ptr); }
1610template <> [[gnu::always_inline]] inline poly8x16x2_t load2_duplicate(poly8_t const *ptr) { return vld2q_dup_p8(ptr); }
1611template <> [[gnu::always_inline]] inline poly16x8x2_t load2_duplicate(poly16_t const *ptr) { return vld2q_dup_p16(ptr); }
1612template <> [[gnu::always_inline]] inline int8x16x3_t load3_duplicate(int8_t const *ptr) { return vld3q_dup_s8(ptr); }
1613template <> [[gnu::always_inline]] inline int16x8x3_t load3_duplicate(int16_t const *ptr) { return vld3q_dup_s16(ptr); }
1614template <> [[gnu::always_inline]] inline int32x4x3_t load3_duplicate(int32_t const *ptr) { return vld3q_dup_s32(ptr); }
1615template <> [[gnu::always_inline]] inline uint8x16x3_t load3_duplicate(uint8_t const *ptr) { return vld3q_dup_u8(ptr); }
1616template <> [[gnu::always_inline]] inline uint16x8x3_t load3_duplicate(uint16_t const *ptr) { return vld3q_dup_u16(ptr); }
1617template <> [[gnu::always_inline]] inline uint32x4x3_t load3_duplicate(uint32_t const *ptr) { return vld3q_dup_u32(ptr); }
1618template <> [[gnu::always_inline]] inline float32x4x3_t load3_duplicate(float32_t const *ptr) { return vld3q_dup_f32(ptr); }
1619template <> [[gnu::always_inline]] inline poly8x16x3_t load3_duplicate(poly8_t const *ptr) { return vld3q_dup_p8(ptr); }
1620template <> [[gnu::always_inline]] inline poly16x8x3_t load3_duplicate(poly16_t const *ptr) { return vld3q_dup_p16(ptr); }
1621template <> [[gnu::always_inline]] inline int8x16x4_t load4_duplicate(int8_t const *ptr) { return vld4q_dup_s8(ptr); }
1622template <> [[gnu::always_inline]] inline int16x8x4_t load4_duplicate(int16_t const *ptr) { return vld4q_dup_s16(ptr); }
1623template <> [[gnu::always_inline]] inline int32x4x4_t load4_duplicate(int32_t const *ptr) { return vld4q_dup_s32(ptr); }
1624template <> [[gnu::always_inline]] inline uint8x16x4_t load4_duplicate(uint8_t const *ptr) { return vld4q_dup_u8(ptr); }
1625template <> [[gnu::always_inline]] inline uint16x8x4_t load4_duplicate(uint16_t const *ptr) { return vld4q_dup_u16(ptr); }
1626template <> [[gnu::always_inline]] inline uint32x4x4_t load4_duplicate(uint32_t const *ptr) { return vld4q_dup_u32(ptr); }
1627template <> [[gnu::always_inline]] inline float32x4x4_t load4_duplicate(float32_t const *ptr) { return vld4q_dup_f32(ptr); }
1628template <> [[gnu::always_inline]] inline poly8x16x4_t load4_duplicate(poly8_t const *ptr) { return vld4q_dup_p8(ptr); }
1629template <> [[gnu::always_inline]] inline poly16x8x4_t load4_duplicate(poly16_t const *ptr) { return vld4q_dup_p16(ptr); }
1630#endif
1631[[gnu::always_inline]] inline int64x1x4_t load4_duplicate(int64_t const *ptr) { return vld4_dup_s64(ptr); }
1632[[gnu::always_inline]] inline uint64x1x4_t load4_duplicate(uint64_t const *ptr) { return vld4_dup_u64(ptr); }
1633template <int lane>[[gnu::always_inline]] nce int16x4x2_t load2_lane(int16_t const *ptr, int16x4x2_t src) { return vld2_lane_s16(ptr, src, lane); }
1634template <int lane>[[gnu::always_inline]] nce int16x8x2_t load2_lane_quad(int16_t const *ptr, int16x8x2_t src) { return vld2q_lane_s16(ptr, src, lane); }
1635template <int lane>[[gnu::always_inline]] nce int32x2x2_t load2_lane(int32_t const *ptr, int32x2x2_t src) { return vld2_lane_s32(ptr, src, lane); }
1636template <int lane>[[gnu::always_inline]] nce int32x4x2_t load2_lane_quad(int32_t const *ptr, int32x4x2_t src) { return vld2q_lane_s32(ptr, src, lane); }
1637template <int lane>[[gnu::always_inline]] nce uint16x4x2_t load2_lane(uint16_t const *ptr, uint16x4x2_t src) { return vld2_lane_u16(ptr, src, lane); }
1638template <int lane>[[gnu::always_inline]] nce uint16x8x2_t load2_lane_quad(uint16_t const *ptr, uint16x8x2_t src) { return vld2q_lane_u16(ptr, src, lane); }
1639template <int lane>[[gnu::always_inline]] nce uint32x2x2_t load2_lane(uint32_t const *ptr, uint32x2x2_t src) { return vld2_lane_u32(ptr, src, lane); }
1640template <int lane>[[gnu::always_inline]] nce uint32x4x2_t load2_lane_quad(uint32_t const *ptr, uint32x4x2_t src) { return vld2q_lane_u32(ptr, src, lane); }
1641template <int lane>[[gnu::always_inline]] nce float32x2x2_t load2_lane(float32_t const *ptr, float32x2x2_t src) { return vld2_lane_f32(ptr, src, lane); }
1642template <int lane>[[gnu::always_inline]] nce float32x4x2_t load2_lane_quad(float32_t const *ptr, float32x4x2_t src) { return vld2q_lane_f32(ptr, src, lane); }
1643template <int lane>[[gnu::always_inline]] nce poly16x4x2_t load2_lane(poly16_t const *ptr, poly16x4x2_t src) { return vld2_lane_p16(ptr, src, lane); }
1644template <int lane>[[gnu::always_inline]] nce poly16x8x2_t load2_lane_quad(poly16_t const *ptr, poly16x8x2_t src) { return vld2q_lane_p16(ptr, src, lane); }
1645template <int lane>[[gnu::always_inline]] nce int8x8x2_t load2_lane(int8_t const *ptr, int8x8x2_t src) { return vld2_lane_s8(ptr, src, lane); }
1646template <int lane>[[gnu::always_inline]] nce uint8x8x2_t load2_lane(uint8_t const *ptr, uint8x8x2_t src) { return vld2_lane_u8(ptr, src, lane); }
1647template <int lane>[[gnu::always_inline]] nce poly8x8x2_t load2_lane(poly8_t const *ptr, poly8x8x2_t src) { return vld2_lane_p8(ptr, src, lane); }
1648template <int lane>[[gnu::always_inline]] nce int16x4x3_t load3_lane(int16_t const *ptr, int16x4x3_t src) { return vld3_lane_s16(ptr, src, lane); }
1649template <int lane>[[gnu::always_inline]] nce int16x8x3_t load3_lane_quad(int16_t const *ptr, int16x8x3_t src) { return vld3q_lane_s16(ptr, src, lane); }
1650template <int lane>[[gnu::always_inline]] nce int32x2x3_t load3_lane(int32_t const *ptr, int32x2x3_t src) { return vld3_lane_s32(ptr, src, lane); }
1651template <int lane>[[gnu::always_inline]] nce int32x4x3_t load3_lane_quad(int32_t const *ptr, int32x4x3_t src) { return vld3q_lane_s32(ptr, src, lane); }
1652template <int lane>[[gnu::always_inline]] nce uint16x4x3_t load3_lane(uint16_t const *ptr, uint16x4x3_t src) { return vld3_lane_u16(ptr, src, lane); }
1653template <int lane>[[gnu::always_inline]] nce uint16x8x3_t load3_lane_quad(uint16_t const *ptr, uint16x8x3_t src) { return vld3q_lane_u16(ptr, src, lane); }
1654template <int lane>[[gnu::always_inline]] nce uint32x2x3_t load3_lane(uint32_t const *ptr, uint32x2x3_t src) { return vld3_lane_u32(ptr, src, lane); }
1655template <int lane>[[gnu::always_inline]] nce uint32x4x3_t load3_lane_quad(uint32_t const *ptr, uint32x4x3_t src) { return vld3q_lane_u32(ptr, src, lane); }
1656template <int lane>[[gnu::always_inline]] nce float32x2x3_t load3_lane(float32_t const *ptr, float32x2x3_t src) { return vld3_lane_f32(ptr, src, lane); }
1657template <int lane>[[gnu::always_inline]] nce float32x4x3_t load3_lane_quad(float32_t const *ptr, float32x4x3_t src) { return vld3q_lane_f32(ptr, src, lane); }
1658template <int lane>[[gnu::always_inline]] nce poly16x4x3_t load3_lane(poly16_t const *ptr, poly16x4x3_t src) { return vld3_lane_p16(ptr, src, lane); }
1659template <int lane>[[gnu::always_inline]] nce poly16x8x3_t load3_lane_quad(poly16_t const *ptr, poly16x8x3_t src) { return vld3q_lane_p16(ptr, src, lane); }
1660template <int lane>[[gnu::always_inline]] nce int8x8x3_t load3_lane(int8_t const *ptr, int8x8x3_t src) { return vld3_lane_s8(ptr, src, lane); }
1661template <int lane>[[gnu::always_inline]] nce uint8x8x3_t load3_lane(uint8_t const *ptr, uint8x8x3_t src) { return vld3_lane_u8(ptr, src, lane); }
1662template <int lane>[[gnu::always_inline]] nce poly8x8x3_t load3_lane(poly8_t const *ptr, poly8x8x3_t src) { return vld3_lane_p8(ptr, src, lane); }
1663template <int lane>[[gnu::always_inline]] nce int16x4x4_t load4_lane(int16_t const *ptr, int16x4x4_t src) { return vld4_lane_s16(ptr, src, lane); }
1664template <int lane>[[gnu::always_inline]] nce int16x8x4_t load4_lane_quad(int16_t const *ptr, int16x8x4_t src) { return vld4q_lane_s16(ptr, src, lane); }
1665template <int lane>[[gnu::always_inline]] nce int32x2x4_t load4_lane(int32_t const *ptr, int32x2x4_t src) { return vld4_lane_s32(ptr, src, lane); }
1666template <int lane>[[gnu::always_inline]] nce int32x4x4_t load4_lane_quad(int32_t const *ptr, int32x4x4_t src) { return vld4q_lane_s32(ptr, src, lane); }
1667template <int lane>[[gnu::always_inline]] nce uint16x4x4_t load4_lane(uint16_t const *ptr, uint16x4x4_t src) { return vld4_lane_u16(ptr, src, lane); }
1668template <int lane>[[gnu::always_inline]] nce uint16x8x4_t load4_lane_quad(uint16_t const *ptr, uint16x8x4_t src) { return vld4q_lane_u16(ptr, src, lane); }
1669template <int lane>[[gnu::always_inline]] nce uint32x2x4_t load4_lane(uint32_t const *ptr, uint32x2x4_t src) { return vld4_lane_u32(ptr, src, lane); }
1670template <int lane>[[gnu::always_inline]] nce uint32x4x4_t load4_lane_quad(uint32_t const *ptr, uint32x4x4_t src) { return vld4q_lane_u32(ptr, src, lane); }
1671template <int lane>[[gnu::always_inline]] nce float32x2x4_t load4_lane(float32_t const *ptr, float32x2x4_t src) { return vld4_lane_f32(ptr, src, lane); }
1672template <int lane>[[gnu::always_inline]] nce float32x4x4_t load4_lane_quad(float32_t const *ptr, float32x4x4_t src) { return vld4q_lane_f32(ptr, src, lane); }
1673template <int lane>[[gnu::always_inline]] nce poly16x4x4_t load4_lane(poly16_t const *ptr, poly16x4x4_t src) { return vld4_lane_p16(ptr, src, lane); }
1674template <int lane>[[gnu::always_inline]] nce poly16x8x4_t load4_lane_quad(poly16_t const *ptr, poly16x8x4_t src) { return vld4q_lane_p16(ptr, src, lane); }
1675template <int lane>[[gnu::always_inline]] nce int8x8x4_t load4_lane(int8_t const *ptr, int8x8x4_t src) { return vld4_lane_s8(ptr, src, lane); }
1676template <int lane>[[gnu::always_inline]] nce uint8x8x4_t load4_lane(uint8_t const *ptr, uint8x8x4_t src) { return vld4_lane_u8(ptr, src, lane); }
1677template <int lane>[[gnu::always_inline]] nce poly8x8x4_t load4_lane(poly8_t const *ptr, poly8x8x4_t src) { return vld4_lane_p8(ptr, src, lane); }
1678#ifdef __clang__
1679template <int lane>[[gnu::always_inline]] nce int8x16x2_t load2_lane_quad(int8_t const *ptr, int8x16x2_t src) { return vld2q_lane_s8(ptr, src, lane); }
1680template <int lane>[[gnu::always_inline]] nce uint8x16x2_t load2_lane_quad(uint8_t const *ptr, uint8x16x2_t src) { return vld2q_lane_u8(ptr, src, lane); }
1681template <int lane>[[gnu::always_inline]] nce int8x16x3_t load3_lane_quad(int8_t const *ptr, int8x16x3_t src) { return vld3q_lane_s8(ptr, src, lane); }
1682template <int lane>[[gnu::always_inline]] nce uint8x16x3_t load3_lane_quad(uint8_t const *ptr, uint8x16x3_t src) { return vld3q_lane_u8(ptr, src, lane); }
1683template <int lane>[[gnu::always_inline]] nce int8x16x4_t load4_lane_quad(int8_t const *ptr, int8x16x4_t src) { return vld4q_lane_s8(ptr, src, lane); }
1684template <int lane>[[gnu::always_inline]] nce uint8x16x4_t load4_lane_quad(uint8_t const *ptr, uint8x16x4_t src) { return vld4q_lane_u8(ptr, src, lane); }
1685#endif
1686#if defined(__clang__) || (__GNUC__ > 13)
1687template <> [[gnu::always_inline]] inline int8x8x2_t load1_x2(int8_t const *ptr) { return vld1_s8_x2(ptr); }
1688template <> [[gnu::always_inline]] inline int8x16x2_t load1_x2(int8_t const *ptr) { return vld1q_s8_x2(ptr); }
1689template <> [[gnu::always_inline]] inline int16x4x2_t load1_x2(int16_t const *ptr) { return vld1_s16_x2(ptr); }
1690template <> [[gnu::always_inline]] inline int16x8x2_t load1_x2(int16_t const *ptr) { return vld1q_s16_x2(ptr); }
1691template <> [[gnu::always_inline]] inline int32x2x2_t load1_x2(int32_t const *ptr) { return vld1_s32_x2(ptr); }
1692template <> [[gnu::always_inline]] inline int32x4x2_t load1_x2(int32_t const *ptr) { return vld1q_s32_x2(ptr); }
1693template <> [[gnu::always_inline]] inline uint8x8x2_t load1_x2(uint8_t const *ptr) { return vld1_u8_x2(ptr); }
1694template <> [[gnu::always_inline]] inline uint8x16x2_t load1_x2(uint8_t const *ptr) { return vld1q_u8_x2(ptr); }
1695template <> [[gnu::always_inline]] inline uint16x4x2_t load1_x2(uint16_t const *ptr) { return vld1_u16_x2(ptr); }
1696template <> [[gnu::always_inline]] inline uint16x8x2_t load1_x2(uint16_t const *ptr) { return vld1q_u16_x2(ptr); }
1697template <> [[gnu::always_inline]] inline uint32x2x2_t load1_x2(uint32_t const *ptr) { return vld1_u32_x2(ptr); }
1698template <> [[gnu::always_inline]] inline uint32x4x2_t load1_x2(uint32_t const *ptr) { return vld1q_u32_x2(ptr); }
1699template <> [[gnu::always_inline]] inline float32x2x2_t load1_x2(float32_t const *ptr) { return vld1_f32_x2(ptr); }
1700template <> [[gnu::always_inline]] inline float32x4x2_t load1_x2(float32_t const *ptr) { return vld1q_f32_x2(ptr); }
1701template <> [[gnu::always_inline]] inline poly8x8x2_t load1_x2(poly8_t const *ptr) { return vld1_p8_x2(ptr); }
1702template <> [[gnu::always_inline]] inline poly8x16x2_t load1_x2(poly8_t const *ptr) { return vld1q_p8_x2(ptr); }
1703template <> [[gnu::always_inline]] inline poly16x4x2_t load1_x2(poly16_t const *ptr) { return vld1_p16_x2(ptr); }
1704template <> [[gnu::always_inline]] inline poly16x8x2_t load1_x2(poly16_t const *ptr) { return vld1q_p16_x2(ptr); }
1705template <> [[gnu::always_inline]] inline int64x1x2_t load1_x2(int64_t const *ptr) { return vld1_s64_x2(ptr); }
1706template <> [[gnu::always_inline]] inline uint64x1x2_t load1_x2(uint64_t const *ptr) { return vld1_u64_x2(ptr); }
1707template <> [[gnu::always_inline]] inline int64x2x2_t load1_x2(int64_t const *ptr) { return vld1q_s64_x2(ptr); }
1708template <> [[gnu::always_inline]] inline uint64x2x2_t load1_x2(uint64_t const *ptr) { return vld1q_u64_x2(ptr); }
1709template <> [[gnu::always_inline]] inline int8x8x3_t load1_x3(int8_t const *ptr) { return vld1_s8_x3(ptr); }
1710template <> [[gnu::always_inline]] inline int16x4x3_t load1_x3(int16_t const *ptr) { return vld1_s16_x3(ptr); }
1711
1712#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_ARCH) && (__ARM_ARCH < 8)
1713template <> [[gnu::always_inline]] inline int8x16x3_t load1_x3(int8_t const *ptr) { return vld1q_s8_x3((const uint8_t*)ptr); }
1714template <> [[gnu::always_inline]] inline int16x8x3_t load1_x3(int16_t const *ptr) { return vld1q_s16_x3((const uint16_t*)ptr); }
1715#else
1716template <> [[gnu::always_inline]] inline int8x16x3_t load1_x3(int8_t const *ptr) { return vld1q_s8_x3(ptr); }
1717template <> [[gnu::always_inline]] inline int16x8x3_t load1_x3(int16_t const *ptr) { return vld1q_s16_x3(ptr); }
1718#endif
1719
1720template <> [[gnu::always_inline]] inline int32x2x3_t load1_x3(int32_t const *ptr) { return vld1_s32_x3(ptr); }
1721template <> [[gnu::always_inline]] inline int32x4x3_t load1_x3(int32_t const *ptr) { return vld1q_s32_x3(ptr); }
1722template <> [[gnu::always_inline]] inline uint8x8x3_t load1_x3(uint8_t const *ptr) { return vld1_u8_x3(ptr); }
1723template <> [[gnu::always_inline]] inline uint8x16x3_t load1_x3(uint8_t const *ptr) { return vld1q_u8_x3(ptr); }
1724template <> [[gnu::always_inline]] inline uint16x4x3_t load1_x3(uint16_t const *ptr) { return vld1_u16_x3(ptr); }
1725template <> [[gnu::always_inline]] inline uint16x8x3_t load1_x3(uint16_t const *ptr) { return vld1q_u16_x3(ptr); }
1726template <> [[gnu::always_inline]] inline uint32x2x3_t load1_x3(uint32_t const *ptr) { return vld1_u32_x3(ptr); }
1727template <> [[gnu::always_inline]] inline uint32x4x3_t load1_x3(uint32_t const *ptr) { return vld1q_u32_x3(ptr); }
1728template <> [[gnu::always_inline]] inline float32x2x3_t load1_x3(float32_t const *ptr) { return vld1_f32_x3(ptr); }
1729template <> [[gnu::always_inline]] inline float32x4x3_t load1_x3(float32_t const *ptr) { return vld1q_f32_x3(ptr); }
1730template <> [[gnu::always_inline]] inline poly8x8x3_t load1_x3(poly8_t const *ptr) { return vld1_p8_x3(ptr); }
1731template <> [[gnu::always_inline]] inline poly8x16x3_t load1_x3(poly8_t const *ptr) { return vld1q_p8_x3(ptr); }
1732template <> [[gnu::always_inline]] inline poly16x4x3_t load1_x3(poly16_t const *ptr) { return vld1_p16_x3(ptr); }
1733template <> [[gnu::always_inline]] inline poly16x8x3_t load1_x3(poly16_t const *ptr) { return vld1q_p16_x3(ptr); }
1734template <> [[gnu::always_inline]] inline int64x1x3_t load1_x3(int64_t const *ptr) { return vld1_s64_x3(ptr); }
1735template <> [[gnu::always_inline]] inline uint64x1x3_t load1_x3(uint64_t const *ptr) { return vld1_u64_x3(ptr); }
1736template <> [[gnu::always_inline]] inline int64x2x3_t load1_x3(int64_t const *ptr) { return vld1q_s64_x3(ptr); }
1737template <> [[gnu::always_inline]] inline uint64x2x3_t load1_x3(uint64_t const *ptr) { return vld1q_u64_x3(ptr); }
1738template <> [[gnu::always_inline]] inline int8x8x4_t load1_x4(int8_t const *ptr) { return vld1_s8_x4(ptr); }
1739
1740#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_ARCH) && (__ARM_ARCH < 8)
1741template <> [[gnu::always_inline]] inline int8x16x4_t load1_x4(int8_t const *ptr) { return vld1q_s8_x4((const uint8_t*)ptr); }
1742template <> [[gnu::always_inline]] inline int16x8x4_t load1_x4(int16_t const *ptr) { return vld1q_s16_x4((const uint16_t*)ptr); }
1743#else
1744template <> [[gnu::always_inline]] inline int8x16x4_t load1_x4(int8_t const *ptr) { return vld1q_s8_x4(ptr); }
1745template <> [[gnu::always_inline]] inline int16x8x4_t load1_x4(int16_t const *ptr) { return vld1q_s16_x4(ptr); }
1746#endif
1747
1748template <> [[gnu::always_inline]] inline int16x4x4_t load1_x4(int16_t const *ptr) { return vld1_s16_x4(ptr); }
1749template <> [[gnu::always_inline]] inline int32x2x4_t load1_x4(int32_t const *ptr) { return vld1_s32_x4(ptr); }
1750template <> [[gnu::always_inline]] inline int32x4x4_t load1_x4(int32_t const *ptr) { return vld1q_s32_x4(ptr); }
1751template <> [[gnu::always_inline]] inline uint8x8x4_t load1_x4(uint8_t const *ptr) { return vld1_u8_x4(ptr); }
1752template <> [[gnu::always_inline]] inline uint8x16x4_t load1_x4(uint8_t const *ptr) { return vld1q_u8_x4(ptr); }
1753template <> [[gnu::always_inline]] inline uint16x4x4_t load1_x4(uint16_t const *ptr) { return vld1_u16_x4(ptr); }
1754template <> [[gnu::always_inline]] inline uint16x8x4_t load1_x4(uint16_t const *ptr) { return vld1q_u16_x4(ptr); }
1755template <> [[gnu::always_inline]] inline uint32x2x4_t load1_x4(uint32_t const *ptr) { return vld1_u32_x4(ptr); }
1756template <> [[gnu::always_inline]] inline uint32x4x4_t load1_x4(uint32_t const *ptr) { return vld1q_u32_x4(ptr); }
1757template <> [[gnu::always_inline]] inline float32x2x4_t load1_x4(float32_t const *ptr) { return vld1_f32_x4(ptr); }
1758template <> [[gnu::always_inline]] inline float32x4x4_t load1_x4(float32_t const *ptr) { return vld1q_f32_x4(ptr); }
1759template <> [[gnu::always_inline]] inline poly8x8x4_t load1_x4(poly8_t const *ptr) { return vld1_p8_x4(ptr); }
1760template <> [[gnu::always_inline]] inline poly8x16x4_t load1_x4(poly8_t const *ptr) { return vld1q_p8_x4(ptr); }
1761template <> [[gnu::always_inline]] inline poly16x4x4_t load1_x4(poly16_t const *ptr) { return vld1_p16_x4(ptr); }
1762template <> [[gnu::always_inline]] inline poly16x8x4_t load1_x4(poly16_t const *ptr) { return vld1q_p16_x4(ptr); }
1763template <> [[gnu::always_inline]] inline int64x1x4_t load1_x4(int64_t const *ptr) { return vld1_s64_x4(ptr); }
1764template <> [[gnu::always_inline]] inline uint64x1x4_t load1_x4(uint64_t const *ptr) { return vld1_u64_x4(ptr); }
1765template <> [[gnu::always_inline]] inline int64x2x4_t load1_x4(int64_t const *ptr) { return vld1q_s64_x4(ptr); }
1766template <> [[gnu::always_inline]] inline uint64x2x4_t load1_x4(uint64_t const *ptr) { return vld1q_u64_x4(ptr); }
1767#endif
1768[[gnu::always_inline]] inline void store1(int8_t *ptr, int8x8_t val) { return vst1_s8(ptr, val); }
1769[[gnu::always_inline]] inline void store1(int16_t *ptr, int16x4_t val) { return vst1_s16(ptr, val); }
1770[[gnu::always_inline]] inline void store1(int32_t *ptr, int32x2_t val) { return vst1_s32(ptr, val); }
1771[[gnu::always_inline]] inline void store1(int64_t *ptr, int64x1_t val) { return vst1_s64(ptr, val); }
1772[[gnu::always_inline]] inline void store1(int64_t *ptr, int64x2_t val) { return vst1q_s64(ptr, val); }
1773[[gnu::always_inline]] inline void store1(uint8_t *ptr, uint8x8_t val) { return vst1_u8(ptr, val); }
1774[[gnu::always_inline]] inline void store1(uint16_t *ptr, uint16x4_t val) { return vst1_u16(ptr, val); }
1775[[gnu::always_inline]] inline void store1(uint32_t *ptr, uint32x2_t val) { return vst1_u32(ptr, val); }
1776[[gnu::always_inline]] inline void store1(uint64_t *ptr, uint64x1_t val) { return vst1_u64(ptr, val); }
1777[[gnu::always_inline]] inline void store1(uint64_t *ptr, uint64x2_t val) { return vst1q_u64(ptr, val); }
1778[[gnu::always_inline]] inline void store1(float32_t *ptr, float32x2_t val) { return vst1_f32(ptr, val); }
1779[[gnu::always_inline]] inline void store1(poly8_t *ptr, poly8x8_t val) { return vst1_p8(ptr, val); }
1780[[gnu::always_inline]] inline void store1(poly8_t *ptr, poly8x16_t val) { return vst1q_p8(ptr, val); }
1781[[gnu::always_inline]] inline void store1(poly16_t *ptr, poly16x4_t val) { return vst1_p16(ptr, val); }
1782[[gnu::always_inline]] inline void store1(poly16_t *ptr, poly16x8_t val) { return vst1q_p16(ptr, val); }
1783template <int lane>[[gnu::always_inline]] nce void store1_lane(int8_t *ptr, int8x8_t val) { return vst1_lane_s8(ptr, val, lane); }
1784template <int lane>[[gnu::always_inline]] nce void store1_lane(int8_t *ptr, int8x16_t val) { return vst1q_lane_s8(ptr, val, lane); }
1785template <int lane>[[gnu::always_inline]] nce void store1_lane(int16_t *ptr, int16x4_t val) { return vst1_lane_s16(ptr, val, lane); }
1786template <int lane>[[gnu::always_inline]] nce void store1_lane(int16_t *ptr, int16x8_t val) { return vst1q_lane_s16(ptr, val, lane); }
1787template <int lane>[[gnu::always_inline]] nce void store1_lane(int32_t *ptr, int32x2_t val) { return vst1_lane_s32(ptr, val, lane); }
1788template <int lane>[[gnu::always_inline]] nce void store1_lane(int32_t *ptr, int32x4_t val) { return vst1q_lane_s32(ptr, val, lane); }
1789template <int lane>[[gnu::always_inline]] nce void store1_lane(int64_t *ptr, int64x1_t val) { return vst1_lane_s64(ptr, val, lane); }
1790template <int lane>[[gnu::always_inline]] nce void store1_lane(int64_t *ptr, int64x2_t val) { return vst1q_lane_s64(ptr, val, lane); }
1791template <int lane>[[gnu::always_inline]] nce void store1_lane(uint8_t *ptr, uint8x8_t val) { return vst1_lane_u8(ptr, val, lane); }
1792template <int lane>[[gnu::always_inline]] nce void store1_lane(uint8_t *ptr, uint8x16_t val) { return vst1q_lane_u8(ptr, val, lane); }
1793template <int lane>[[gnu::always_inline]] nce void store1_lane(uint16_t *ptr, uint16x4_t val) { return vst1_lane_u16(ptr, val, lane); }
1794template <int lane>[[gnu::always_inline]] nce void store1_lane(uint16_t *ptr, uint16x8_t val) { return vst1q_lane_u16(ptr, val, lane); }
1795template <int lane>[[gnu::always_inline]] nce void store1_lane(uint32_t *ptr, uint32x2_t val) { return vst1_lane_u32(ptr, val, lane); }
1796template <int lane>[[gnu::always_inline]] nce void store1_lane(uint32_t *ptr, uint32x4_t val) { return vst1q_lane_u32(ptr, val, lane); }
1797template <int lane>[[gnu::always_inline]] nce void store1_lane(uint64_t *ptr, uint64x1_t val) { return vst1_lane_u64(ptr, val, lane); }
1798template <int lane>[[gnu::always_inline]] nce void store1_lane(uint64_t *ptr, uint64x2_t val) { return vst1q_lane_u64(ptr, val, lane); }
1799template <int lane>[[gnu::always_inline]] nce void store1_lane(float32_t *ptr, float32x2_t val) { return vst1_lane_f32(ptr, val, lane); }
1800template <int lane>[[gnu::always_inline]] nce void store1_lane(float32_t *ptr, float32x4_t val) { return vst1q_lane_f32(ptr, val, lane); }
1801template <int lane>[[gnu::always_inline]] nce void store1_lane(poly8_t *ptr, poly8x8_t val) { return vst1_lane_p8(ptr, val, lane); }
1802template <int lane>[[gnu::always_inline]] nce void store1_lane(poly8_t *ptr, poly8x16_t val) { return vst1q_lane_p8(ptr, val, lane); }
1803template <int lane>[[gnu::always_inline]] nce void store1_lane(poly16_t *ptr, poly16x4_t val) { return vst1_lane_p16(ptr, val, lane); }
1804template <int lane>[[gnu::always_inline]] nce void store1_lane(poly16_t *ptr, poly16x8_t val) { return vst1q_lane_p16(ptr, val, lane); }
1805[[gnu::always_inline]] inline void store2(int8_t *ptr, int8x8x2_t val) { return vst2_s8(ptr, val); }
1806[[gnu::always_inline]] inline void store2(int8_t *ptr, int8x16x2_t val) { return vst2q_s8(ptr, val); }
1807[[gnu::always_inline]] inline void store2(int16_t *ptr, int16x4x2_t val) { return vst2_s16(ptr, val); }
1808[[gnu::always_inline]] inline void store2(int16_t *ptr, int16x8x2_t val) { return vst2q_s16(ptr, val); }
1809[[gnu::always_inline]] inline void store2(int32_t *ptr, int32x2x2_t val) { return vst2_s32(ptr, val); }
1810[[gnu::always_inline]] inline void store2(int32_t *ptr, int32x4x2_t val) { return vst2q_s32(ptr, val); }
1811[[gnu::always_inline]] inline void store2(uint8_t *ptr, uint8x8x2_t val) { return vst2_u8(ptr, val); }
1812[[gnu::always_inline]] inline void store2(uint8_t *ptr, uint8x16x2_t val) { return vst2q_u8(ptr, val); }
1813[[gnu::always_inline]] inline void store2(uint16_t *ptr, uint16x4x2_t val) { return vst2_u16(ptr, val); }
1814[[gnu::always_inline]] inline void store2(uint16_t *ptr, uint16x8x2_t val) { return vst2q_u16(ptr, val); }
1815[[gnu::always_inline]] inline void store2(uint32_t *ptr, uint32x2x2_t val) { return vst2_u32(ptr, val); }
1816[[gnu::always_inline]] inline void store2(uint32_t *ptr, uint32x4x2_t val) { return vst2q_u32(ptr, val); }
1817[[gnu::always_inline]] inline void store2(float32_t *ptr, float32x2x2_t val) { return vst2_f32(ptr, val); }
1818[[gnu::always_inline]] inline void store2(float32_t *ptr, float32x4x2_t val) { return vst2q_f32(ptr, val); }
1819[[gnu::always_inline]] inline void store2(poly8_t *ptr, poly8x8x2_t val) { return vst2_p8(ptr, val); }
1820[[gnu::always_inline]] inline void store2(poly8_t *ptr, poly8x16x2_t val) { return vst2q_p8(ptr, val); }
1821[[gnu::always_inline]] inline void store2(poly16_t *ptr, poly16x4x2_t val) { return vst2_p16(ptr, val); }
1822[[gnu::always_inline]] inline void store2(poly16_t *ptr, poly16x8x2_t val) { return vst2q_p16(ptr, val); }
1823[[gnu::always_inline]] inline void store2(int64_t *ptr, int64x1x2_t val) { return vst2_s64(ptr, val); }
1824[[gnu::always_inline]] inline void store2(uint64_t *ptr, uint64x1x2_t val) { return vst2_u64(ptr, val); }
1825[[gnu::always_inline]] inline void store3(int8_t *ptr, int8x8x3_t val) { return vst3_s8(ptr, val); }
1826[[gnu::always_inline]] inline void store3(int8_t *ptr, int8x16x3_t val) { return vst3q_s8(ptr, val); }
1827[[gnu::always_inline]] inline void store3(int16_t *ptr, int16x4x3_t val) { return vst3_s16(ptr, val); }
1828[[gnu::always_inline]] inline void store3(int16_t *ptr, int16x8x3_t val) { return vst3q_s16(ptr, val); }
1829[[gnu::always_inline]] inline void store3(int32_t *ptr, int32x2x3_t val) { return vst3_s32(ptr, val); }
1830[[gnu::always_inline]] inline void store3(int32_t *ptr, int32x4x3_t val) { return vst3q_s32(ptr, val); }
1831[[gnu::always_inline]] inline void store3(uint8_t *ptr, uint8x8x3_t val) { return vst3_u8(ptr, val); }
1832[[gnu::always_inline]] inline void store3(uint8_t *ptr, uint8x16x3_t val) { return vst3q_u8(ptr, val); }
1833[[gnu::always_inline]] inline void store3(uint16_t *ptr, uint16x4x3_t val) { return vst3_u16(ptr, val); }
1834[[gnu::always_inline]] inline void store3(uint16_t *ptr, uint16x8x3_t val) { return vst3q_u16(ptr, val); }
1835[[gnu::always_inline]] inline void store3(uint32_t *ptr, uint32x2x3_t val) { return vst3_u32(ptr, val); }
1836[[gnu::always_inline]] inline void store3(uint32_t *ptr, uint32x4x3_t val) { return vst3q_u32(ptr, val); }
1837[[gnu::always_inline]] inline void store3(float32_t *ptr, float32x2x3_t val) { return vst3_f32(ptr, val); }
1838[[gnu::always_inline]] inline void store3(float32_t *ptr, float32x4x3_t val) { return vst3q_f32(ptr, val); }
1839[[gnu::always_inline]] inline void store3(poly8_t *ptr, poly8x8x3_t val) { return vst3_p8(ptr, val); }
1840[[gnu::always_inline]] inline void store3(poly8_t *ptr, poly8x16x3_t val) { return vst3q_p8(ptr, val); }
1841[[gnu::always_inline]] inline void store3(poly16_t *ptr, poly16x4x3_t val) { return vst3_p16(ptr, val); }
1842[[gnu::always_inline]] inline void store3(poly16_t *ptr, poly16x8x3_t val) { return vst3q_p16(ptr, val); }
1843[[gnu::always_inline]] inline void store3(int64_t *ptr, int64x1x3_t val) { return vst3_s64(ptr, val); }
1844[[gnu::always_inline]] inline void store3(uint64_t *ptr, uint64x1x3_t val) { return vst3_u64(ptr, val); }
1845[[gnu::always_inline]] inline void store4(int8_t *ptr, int8x8x4_t val) { return vst4_s8(ptr, val); }
1846[[gnu::always_inline]] inline void store4(int8_t *ptr, int8x16x4_t val) { return vst4q_s8(ptr, val); }
1847[[gnu::always_inline]] inline void store4(int16_t *ptr, int16x4x4_t val) { return vst4_s16(ptr, val); }
1848[[gnu::always_inline]] inline void store4(int16_t *ptr, int16x8x4_t val) { return vst4q_s16(ptr, val); }
1849[[gnu::always_inline]] inline void store4(int32_t *ptr, int32x2x4_t val) { return vst4_s32(ptr, val); }
1850[[gnu::always_inline]] inline void store4(int32_t *ptr, int32x4x4_t val) { return vst4q_s32(ptr, val); }
1851[[gnu::always_inline]] inline void store4(uint8_t *ptr, uint8x8x4_t val) { return vst4_u8(ptr, val); }
1852[[gnu::always_inline]] inline void store4(uint8_t *ptr, uint8x16x4_t val) { return vst4q_u8(ptr, val); }
1853[[gnu::always_inline]] inline void store4(uint16_t *ptr, uint16x4x4_t val) { return vst4_u16(ptr, val); }
1854[[gnu::always_inline]] inline void store4(uint16_t *ptr, uint16x8x4_t val) { return vst4q_u16(ptr, val); }
1855[[gnu::always_inline]] inline void store4(uint32_t *ptr, uint32x2x4_t val) { return vst4_u32(ptr, val); }
1856[[gnu::always_inline]] inline void store4(uint32_t *ptr, uint32x4x4_t val) { return vst4q_u32(ptr, val); }
1857[[gnu::always_inline]] inline void store4(float32_t *ptr, float32x2x4_t val) { return vst4_f32(ptr, val); }
1858[[gnu::always_inline]] inline void store4(float32_t *ptr, float32x4x4_t val) { return vst4q_f32(ptr, val); }
1859[[gnu::always_inline]] inline void store4(poly8_t *ptr, poly8x8x4_t val) { return vst4_p8(ptr, val); }
1860[[gnu::always_inline]] inline void store4(poly8_t *ptr, poly8x16x4_t val) { return vst4q_p8(ptr, val); }
1861[[gnu::always_inline]] inline void store4(poly16_t *ptr, poly16x4x4_t val) { return vst4_p16(ptr, val); }
1862[[gnu::always_inline]] inline void store4(poly16_t *ptr, poly16x8x4_t val) { return vst4q_p16(ptr, val); }
1863[[gnu::always_inline]] inline void store4(int64_t *ptr, int64x1x4_t val) { return vst4_s64(ptr, val); }
1864[[gnu::always_inline]] inline void store4(uint64_t *ptr, uint64x1x4_t val) { return vst4_u64(ptr, val); }
1865template <int lane>[[gnu::always_inline]] nce void store2_lane(int8_t *ptr, int8x8x2_t val) { return vst2_lane_s8(ptr, val, lane); }
1866template <int lane>[[gnu::always_inline]] nce void store2_lane(uint8_t *ptr, uint8x8x2_t val) { return vst2_lane_u8(ptr, val, lane); }
1867template <int lane>[[gnu::always_inline]] nce void store2_lane(poly8_t *ptr, poly8x8x2_t val) { return vst2_lane_p8(ptr, val, lane); }
1868template <int lane>[[gnu::always_inline]] nce void store3_lane(int8_t *ptr, int8x8x3_t val) { return vst3_lane_s8(ptr, val, lane); }
1869template <int lane>[[gnu::always_inline]] nce void store3_lane(uint8_t *ptr, uint8x8x3_t val) { return vst3_lane_u8(ptr, val, lane); }
1870template <int lane>[[gnu::always_inline]] nce void store3_lane(poly8_t *ptr, poly8x8x3_t val) { return vst3_lane_p8(ptr, val, lane); }
1871template <int lane>[[gnu::always_inline]] nce void store4_lane(int8_t *ptr, int8x8x4_t val) { return vst4_lane_s8(ptr, val, lane); }
1872template <int lane>[[gnu::always_inline]] nce void store4_lane(uint8_t *ptr, uint8x8x4_t val) { return vst4_lane_u8(ptr, val, lane); }
1873template <int lane>[[gnu::always_inline]] nce void store4_lane(poly8_t *ptr, poly8x8x4_t val) { return vst4_lane_p8(ptr, val, lane); }
1874template <int lane>[[gnu::always_inline]] nce void store2_lane(int16_t *ptr, int16x4x2_t val) { return vst2_lane_s16(ptr, val, lane); }
1875template <int lane>[[gnu::always_inline]] nce void store2_lane(int16_t *ptr, int16x8x2_t val) { return vst2q_lane_s16(ptr, val, lane); }
1876template <int lane>[[gnu::always_inline]] nce void store2_lane(int32_t *ptr, int32x2x2_t val) { return vst2_lane_s32(ptr, val, lane); }
1877template <int lane>[[gnu::always_inline]] nce void store2_lane(int32_t *ptr, int32x4x2_t val) { return vst2q_lane_s32(ptr, val, lane); }
1878template <int lane>[[gnu::always_inline]] nce void store2_lane(uint16_t *ptr, uint16x4x2_t val) { return vst2_lane_u16(ptr, val, lane); }
1879template <int lane>[[gnu::always_inline]] nce void store2_lane(uint16_t *ptr, uint16x8x2_t val) { return vst2q_lane_u16(ptr, val, lane); }
1880template <int lane>[[gnu::always_inline]] nce void store2_lane(uint32_t *ptr, uint32x2x2_t val) { return vst2_lane_u32(ptr, val, lane); }
1881template <int lane>[[gnu::always_inline]] nce void store2_lane(uint32_t *ptr, uint32x4x2_t val) { return vst2q_lane_u32(ptr, val, lane); }
1882template <int lane>[[gnu::always_inline]] nce void store2_lane(float32_t *ptr, float32x2x2_t val) { return vst2_lane_f32(ptr, val, lane); }
1883template <int lane>[[gnu::always_inline]] nce void store2_lane(float32_t *ptr, float32x4x2_t val) { return vst2q_lane_f32(ptr, val, lane); }
1884template <int lane>[[gnu::always_inline]] nce void store2_lane(poly16_t *ptr, poly16x4x2_t val) { return vst2_lane_p16(ptr, val, lane); }
1885template <int lane>[[gnu::always_inline]] nce void store2_lane(poly16_t *ptr, poly16x8x2_t val) { return vst2q_lane_p16(ptr, val, lane); }
1886template <int lane>[[gnu::always_inline]] nce void store3_lane(int16_t *ptr, int16x4x3_t val) { return vst3_lane_s16(ptr, val, lane); }
1887template <int lane>[[gnu::always_inline]] nce void store3_lane(int16_t *ptr, int16x8x3_t val) { return vst3q_lane_s16(ptr, val, lane); }
1888template <int lane>[[gnu::always_inline]] nce void store3_lane(int32_t *ptr, int32x2x3_t val) { return vst3_lane_s32(ptr, val, lane); }
1889template <int lane>[[gnu::always_inline]] nce void store3_lane(int32_t *ptr, int32x4x3_t val) { return vst3q_lane_s32(ptr, val, lane); }
1890template <int lane>[[gnu::always_inline]] nce void store3_lane(uint16_t *ptr, uint16x4x3_t val) { return vst3_lane_u16(ptr, val, lane); }
1891template <int lane>[[gnu::always_inline]] nce void store3_lane(uint16_t *ptr, uint16x8x3_t val) { return vst3q_lane_u16(ptr, val, lane); }
1892template <int lane>[[gnu::always_inline]] nce void store3_lane(uint32_t *ptr, uint32x2x3_t val) { return vst3_lane_u32(ptr, val, lane); }
1893template <int lane>[[gnu::always_inline]] nce void store3_lane(uint32_t *ptr, uint32x4x3_t val) { return vst3q_lane_u32(ptr, val, lane); }
1894template <int lane>[[gnu::always_inline]] nce void store3_lane(float32_t *ptr, float32x2x3_t val) { return vst3_lane_f32(ptr, val, lane); }
1895template <int lane>[[gnu::always_inline]] nce void store3_lane(float32_t *ptr, float32x4x3_t val) { return vst3q_lane_f32(ptr, val, lane); }
1896template <int lane>[[gnu::always_inline]] nce void store3_lane(poly16_t *ptr, poly16x4x3_t val) { return vst3_lane_p16(ptr, val, lane); }
1897template <int lane>[[gnu::always_inline]] nce void store3_lane(poly16_t *ptr, poly16x8x3_t val) { return vst3q_lane_p16(ptr, val, lane); }
1898// template <int lane>[[gnu::always_inline]] nce void store3_lane(int8_t *ptr, int8x16x3_t val) { return vst3q_lane_s8(ptr, val, lane); }
1899// template <int lane>[[gnu::always_inline]] nce void store3_lane(uint8_t *ptr, uint8x16x3_t val) { return vst3q_lane_u8(ptr, val, lane); }
1900// template <int lane>[[gnu::always_inline]] nce void store3_lane(poly8_t *ptr, poly8x16x3_t val) { return vst3q_lane_p8(ptr, val, lane); }
1901template <int lane>[[gnu::always_inline]] nce void store4_lane(int16_t *ptr, int16x4x4_t val) { return vst4_lane_s16(ptr, val, lane); }
1902template <int lane>[[gnu::always_inline]] nce void store4_lane(int16_t *ptr, int16x8x4_t val) { return vst4q_lane_s16(ptr, val, lane); }
1903template <int lane>[[gnu::always_inline]] nce void store4_lane(int32_t *ptr, int32x2x4_t val) { return vst4_lane_s32(ptr, val, lane); }
1904template <int lane>[[gnu::always_inline]] nce void store4_lane(int32_t *ptr, int32x4x4_t val) { return vst4q_lane_s32(ptr, val, lane); }
1905template <int lane>[[gnu::always_inline]] nce void store4_lane(uint16_t *ptr, uint16x4x4_t val) { return vst4_lane_u16(ptr, val, lane); }
1906template <int lane>[[gnu::always_inline]] nce void store4_lane(uint16_t *ptr, uint16x8x4_t val) { return vst4q_lane_u16(ptr, val, lane); }
1907template <int lane>[[gnu::always_inline]] nce void store4_lane(uint32_t *ptr, uint32x2x4_t val) { return vst4_lane_u32(ptr, val, lane); }
1908template <int lane>[[gnu::always_inline]] nce void store4_lane(uint32_t *ptr, uint32x4x4_t val) { return vst4q_lane_u32(ptr, val, lane); }
1909template <int lane>[[gnu::always_inline]] nce void store4_lane(float32_t *ptr, float32x2x4_t val) { return vst4_lane_f32(ptr, val, lane); }
1910template <int lane>[[gnu::always_inline]] nce void store4_lane(float32_t *ptr, float32x4x4_t val) { return vst4q_lane_f32(ptr, val, lane); }
1911template <int lane>[[gnu::always_inline]] nce void store4_lane(poly16_t *ptr, poly16x4x4_t val) { return vst4_lane_p16(ptr, val, lane); }
1912template <int lane>[[gnu::always_inline]] nce void store4_lane(poly16_t *ptr, poly16x8x4_t val) { return vst4q_lane_p16(ptr, val, lane); }
1913#if defined(__clang__) || (__GNUC__ > 13)
1914[[gnu::always_inline]] inline void store1_x2(int8_t *ptr, int8x8x2_t val) { return vst1_s8_x2(ptr, val); }
1915[[gnu::always_inline]] inline void store1_x2(int8_t *ptr, int8x16x2_t val) { return vst1q_s8_x2(ptr, val); }
1916[[gnu::always_inline]] inline void store1_x2(int16_t *ptr, int16x4x2_t val) { return vst1_s16_x2(ptr, val); }
1917[[gnu::always_inline]] inline void store1_x2(int16_t *ptr, int16x8x2_t val) { return vst1q_s16_x2(ptr, val); }
1918[[gnu::always_inline]] inline void store1_x2(int32_t *ptr, int32x2x2_t val) { return vst1_s32_x2(ptr, val); }
1919[[gnu::always_inline]] inline void store1_x2(int32_t *ptr, int32x4x2_t val) { return vst1q_s32_x2(ptr, val); }
1920[[gnu::always_inline]] inline void store1_x2(uint8_t *ptr, uint8x8x2_t val) { return vst1_u8_x2(ptr, val); }
1921[[gnu::always_inline]] inline void store1_x2(uint8_t *ptr, uint8x16x2_t val) { return vst1q_u8_x2(ptr, val); }
1922[[gnu::always_inline]] inline void store1_x2(uint16_t *ptr, uint16x4x2_t val) { return vst1_u16_x2(ptr, val); }
1923[[gnu::always_inline]] inline void store1_x2(uint16_t *ptr, uint16x8x2_t val) { return vst1q_u16_x2(ptr, val); }
1924[[gnu::always_inline]] inline void store1_x2(uint32_t *ptr, uint32x2x2_t val) { return vst1_u32_x2(ptr, val); }
1925[[gnu::always_inline]] inline void store1_x2(uint32_t *ptr, uint32x4x2_t val) { return vst1q_u32_x2(ptr, val); }
1926[[gnu::always_inline]] inline void store1_x2(float32_t *ptr, float32x2x2_t val) { return vst1_f32_x2(ptr, val); }
1927[[gnu::always_inline]] inline void store1_x2(float32_t *ptr, float32x4x2_t val) { return vst1q_f32_x2(ptr, val); }
1928[[gnu::always_inline]] inline void store1_x2(poly8_t *ptr, poly8x8x2_t val) { return vst1_p8_x2(ptr, val); }
1929[[gnu::always_inline]] inline void store1_x2(poly8_t *ptr, poly8x16x2_t val) { return vst1q_p8_x2(ptr, val); }
1930[[gnu::always_inline]] inline void store1_x2(poly16_t *ptr, poly16x4x2_t val) { return vst1_p16_x2(ptr, val); }
1931[[gnu::always_inline]] inline void store1_x2(poly16_t *ptr, poly16x8x2_t val) { return vst1q_p16_x2(ptr, val); }
1932[[gnu::always_inline]] inline void store1_x2(int64_t *ptr, int64x1x2_t val) { return vst1_s64_x2(ptr, val); }
1933[[gnu::always_inline]] inline void store1_x2(uint64_t *ptr, uint64x1x2_t val) { return vst1_u64_x2(ptr, val); }
1934[[gnu::always_inline]] inline void store1_x2(int64_t *ptr, int64x2x2_t val) { return vst1q_s64_x2(ptr, val); }
1935[[gnu::always_inline]] inline void store1_x2(uint64_t *ptr, uint64x2x2_t val) { return vst1q_u64_x2(ptr, val); }
1936[[gnu::always_inline]] inline void store1_x3(int8_t *ptr, int8x8x3_t val) { return vst1_s8_x3(ptr, val); }
1937[[gnu::always_inline]] inline void store1_x3(int8_t *ptr, int8x16x3_t val) { return vst1q_s8_x3(ptr, val); }
1938[[gnu::always_inline]] inline void store1_x3(int16_t *ptr, int16x4x3_t val) { return vst1_s16_x3(ptr, val); }
1939[[gnu::always_inline]] inline void store1_x3(int16_t *ptr, int16x8x3_t val) { return vst1q_s16_x3(ptr, val); }
1940[[gnu::always_inline]] inline void store1_x3(int32_t *ptr, int32x2x3_t val) { return vst1_s32_x3(ptr, val); }
1941[[gnu::always_inline]] inline void store1_x3(int32_t *ptr, int32x4x3_t val) { return vst1q_s32_x3(ptr, val); }
1942[[gnu::always_inline]] inline void store1_x3(uint8_t *ptr, uint8x8x3_t val) { return vst1_u8_x3(ptr, val); }
1943[[gnu::always_inline]] inline void store1_x3(uint8_t *ptr, uint8x16x3_t val) { return vst1q_u8_x3(ptr, val); }
1944[[gnu::always_inline]] inline void store1_x3(uint16_t *ptr, uint16x4x3_t val) { return vst1_u16_x3(ptr, val); }
1945[[gnu::always_inline]] inline void store1_x3(uint16_t *ptr, uint16x8x3_t val) { return vst1q_u16_x3(ptr, val); }
1946[[gnu::always_inline]] inline void store1_x3(uint32_t *ptr, uint32x2x3_t val) { return vst1_u32_x3(ptr, val); }
1947[[gnu::always_inline]] inline void store1_x3(uint32_t *ptr, uint32x4x3_t val) { return vst1q_u32_x3(ptr, val); }
1948[[gnu::always_inline]] inline void store1_x3(float32_t *ptr, float32x2x3_t val) { return vst1_f32_x3(ptr, val); }
1949[[gnu::always_inline]] inline void store1_x3(float32_t *ptr, float32x4x3_t val) { return vst1q_f32_x3(ptr, val); }
1950[[gnu::always_inline]] inline void store1_x3(poly8_t *ptr, poly8x8x3_t val) { return vst1_p8_x3(ptr, val); }
1951[[gnu::always_inline]] inline void store1_x3(poly8_t *ptr, poly8x16x3_t val) { return vst1q_p8_x3(ptr, val); }
1952[[gnu::always_inline]] inline void store1_x3(poly16_t *ptr, poly16x4x3_t val) { return vst1_p16_x3(ptr, val); }
1953[[gnu::always_inline]] inline void store1_x3(poly16_t *ptr, poly16x8x3_t val) { return vst1q_p16_x3(ptr, val); }
1954[[gnu::always_inline]] inline void store1_x3(int64_t *ptr, int64x1x3_t val) { return vst1_s64_x3(ptr, val); }
1955[[gnu::always_inline]] inline void store1_x3(uint64_t *ptr, uint64x1x3_t val) { return vst1_u64_x3(ptr, val); }
1956[[gnu::always_inline]] inline void store1_x3(int64_t *ptr, int64x2x3_t val) { return vst1q_s64_x3(ptr, val); }
1957[[gnu::always_inline]] inline void store1_x3(uint64_t *ptr, uint64x2x3_t val) { return vst1q_u64_x3(ptr, val); }
1958// [[gnu::always_inline]] inline void store1_x3(poly64_t *ptr, poly64x2x3_t val) { return vst1q_p64_x3(ptr, val); }
1959[[gnu::always_inline]] inline void store1_x4(int8_t *ptr, int8x8x4_t val) { return vst1_s8_x4(ptr, val); }
1960[[gnu::always_inline]] inline void store1_x4(int8_t *ptr, int8x16x4_t val) { return vst1q_s8_x4(ptr, val); }
1961[[gnu::always_inline]] inline void store1_x4(int16_t *ptr, int16x4x4_t val) { return vst1_s16_x4(ptr, val); }
1962[[gnu::always_inline]] inline void store1_x4(int16_t *ptr, int16x8x4_t val) { return vst1q_s16_x4(ptr, val); }
1963[[gnu::always_inline]] inline void store1_x4(int32_t *ptr, int32x2x4_t val) { return vst1_s32_x4(ptr, val); }
1964[[gnu::always_inline]] inline void store1_x4(int32_t *ptr, int32x4x4_t val) { return vst1q_s32_x4(ptr, val); }
1965[[gnu::always_inline]] inline void store1_x4(uint8_t *ptr, uint8x8x4_t val) { return vst1_u8_x4(ptr, val); }
1966[[gnu::always_inline]] inline void store1_x4(uint8_t *ptr, uint8x16x4_t val) { return vst1q_u8_x4(ptr, val); }
1967[[gnu::always_inline]] inline void store1_x4(uint16_t *ptr, uint16x4x4_t val) { return vst1_u16_x4(ptr, val); }
1968[[gnu::always_inline]] inline void store1_x4(uint16_t *ptr, uint16x8x4_t val) { return vst1q_u16_x4(ptr, val); }
1969[[gnu::always_inline]] inline void store1_x4(uint32_t *ptr, uint32x2x4_t val) { return vst1_u32_x4(ptr, val); }
1970[[gnu::always_inline]] inline void store1_x4(uint32_t *ptr, uint32x4x4_t val) { return vst1q_u32_x4(ptr, val); }
1971[[gnu::always_inline]] inline void store1_x4(float32_t *ptr, float32x2x4_t val) { return vst1_f32_x4(ptr, val); }
1972[[gnu::always_inline]] inline void store1_x4(float32_t *ptr, float32x4x4_t val) { return vst1q_f32_x4(ptr, val); }
1973[[gnu::always_inline]] inline void store1_x4(poly8_t *ptr, poly8x8x4_t val) { return vst1_p8_x4(ptr, val); }
1974[[gnu::always_inline]] inline void store1_x4(poly8_t *ptr, poly8x16x4_t val) { return vst1q_p8_x4(ptr, val); }
1975[[gnu::always_inline]] inline void store1_x4(poly16_t *ptr, poly16x4x4_t val) { return vst1_p16_x4(ptr, val); }
1976[[gnu::always_inline]] inline void store1_x4(poly16_t *ptr, poly16x8x4_t val) { return vst1q_p16_x4(ptr, val); }
1977[[gnu::always_inline]] inline void store1_x4(int64_t *ptr, int64x1x4_t val) { return vst1_s64_x4(ptr, val); }
1978[[gnu::always_inline]] inline void store1_x4(uint64_t *ptr, uint64x1x4_t val) { return vst1_u64_x4(ptr, val); }
1979[[gnu::always_inline]] inline void store1_x4(int64_t *ptr, int64x2x4_t val) { return vst1q_s64_x4(ptr, val); }
1980[[gnu::always_inline]] inline void store1_x4(uint64_t *ptr, uint64x2x4_t val) { return vst1q_u64_x4(ptr, val); }
1981#endif
1982[[gnu::always_inline]] nce int8x8_t table_lookup2(int8x8x2_t a, int8x8_t idx) { return vtbl2_s8(a, idx); }
1983[[gnu::always_inline]] nce uint8x8_t table_lookup2(uint8x8x2_t a, uint8x8_t idx) { return vtbl2_u8(a, idx); }
1984[[gnu::always_inline]] nce poly8x8_t table_lookup2(poly8x8x2_t a, uint8x8_t idx) { return vtbl2_p8(a, idx); }
1985[[gnu::always_inline]] nce int8x8_t table_lookup3(int8x8x3_t a, int8x8_t idx) { return vtbl3_s8(a, idx); }
1986[[gnu::always_inline]] nce uint8x8_t table_lookup3(uint8x8x3_t a, uint8x8_t idx) { return vtbl3_u8(a, idx); }
1987[[gnu::always_inline]] nce poly8x8_t table_lookup3(poly8x8x3_t a, uint8x8_t idx) { return vtbl3_p8(a, idx); }
1988[[gnu::always_inline]] nce int8x8_t table_lookup4(int8x8x4_t a, int8x8_t idx) { return vtbl4_s8(a, idx); }
1989[[gnu::always_inline]] nce uint8x8_t table_lookup4(uint8x8x4_t a, uint8x8_t idx) { return vtbl4_u8(a, idx); }
1990[[gnu::always_inline]] nce poly8x8_t table_lookup4(poly8x8x4_t a, uint8x8_t idx) { return vtbl4_p8(a, idx); }
1991#ifdef __clang__
1992[[gnu::always_inline]] nce poly64x1_t add(poly64x1_t a, poly64x1_t b) { return vadd_p64(a, b); }
1993[[gnu::always_inline]] nce poly8x16_t add(poly8x16_t a, poly8x16_t b) { return vaddq_p8(a, b); }
1994[[gnu::always_inline]] nce poly16x8_t add(poly16x8_t a, poly16x8_t b) { return vaddq_p16(a, b); }
1995[[gnu::always_inline]] nce poly64x2_t add(poly64x2_t a, poly64x2_t b) { return vaddq_p64(a, b); }
1996// [[gnu::always_inline]] nce poly128_t add(poly128_t a, poly128_t b) { return vaddq_p128(a, b); }
1997#endif
1998// clang-format on
1999
2000} // namespace neon
2001#undef nce
2002
2003#endif // __cplusplus