Argon 0.1.0
a64.hpp
#pragma once
#include "a32.hpp"

#ifdef __clang__
#define nce constexpr
#else
#define nce inline
#endif

#ifdef __cplusplus
namespace neon {
// clang-format off
template <typename T> nce T max(float64x1_t a, float64x1_t b);
template <typename T> nce T max(float64x2_t a, float64x2_t b);
template <typename T> nce T min(float64x1_t a, float64x1_t b);
template <typename T> nce T min(float64x2_t a, float64x2_t b);
template <typename T> nce T max_strict(float64x1_t a, float64x1_t b);
template <typename T> nce T max_strict(float64x2_t a, float64x2_t b);
template <typename T> nce T min_strict(float64x1_t a, float64x1_t b);
template <typename T> nce T min_strict(float64x2_t a, float64x2_t b);

template <typename T> nce T shift_right_saturate_narrow(int16_t a);
template <typename T> nce T shift_right_saturate_narrow(int32_t a);
template <typename T> nce T shift_right_saturate_narrow(int64_t a);
template <typename T> nce T shift_right_saturate_narrow(uint16_t a);
template <typename T> nce T shift_right_saturate_narrow(uint32_t a);
template <typename T> nce T shift_right_saturate_narrow(uint64_t a);
template <typename T> nce T shift_right_saturate_narrow(int8x8_t r, int16x8_t a);
template <typename T> nce T shift_right_saturate_narrow(int16x4_t r, int32x4_t a);
template <typename T> nce T shift_right_saturate_narrow(int32x2_t r, int64x2_t a);
template <typename T> nce T shift_right_saturate_narrow(uint8x8_t r, uint16x8_t a);
template <typename T> nce T shift_right_saturate_narrow(uint16x4_t r, uint32x4_t a);
template <typename T> nce T shift_right_saturate_narrow(uint32x2_t r, uint64x2_t a);
template <typename T> nce T convert(float32_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float32_t a);
template <typename T> nce T convert_round_toward_negative_infinity(float32_t a);
template <typename T> nce T convert_round_toward_positive_infinity(float32_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float32_t a);
template <typename T> nce T convert(float64x1_t a);
template <typename T> nce T convert(float64x2_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float64x1_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float64x2_t a);
template <typename T> nce T convert_round_toward_negative_infinity(float64x1_t a);
template <typename T> nce T convert_round_toward_negative_infinity(float64x2_t a);
template <typename T> nce T convert_round_toward_positive_infinity(float64x1_t a);
template <typename T> nce T convert_round_toward_positive_infinity(float64x2_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float64x1_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float64x2_t a);
template <typename T> nce T convert(float64_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float64_t a);
template <typename T> nce T convert_round_toward_negative_infinity(float64_t a);
template <typename T> nce T convert_round_toward_positive_infinity(float64_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float64_t a);
template <typename T> nce T convert(int32_t a);
template <typename T> nce T convert(uint32_t a);
template <typename T> nce T convert(int64x1_t a);
template <typename T> nce T convert(int64x2_t a);
template <typename T> nce T convert(uint64x1_t a);
template <typename T> nce T convert(uint64x2_t a);
template <typename T> nce T convert(int64_t a);
template <typename T> nce T convert(uint64_t a);
template <typename T> nce T reinterpret(int8x8_t a);
template <typename T> nce T reinterpret(int16x4_t a);
template <typename T> nce T reinterpret(int32x2_t a);
template <typename T> nce T reinterpret(float32x2_t a);
template <typename T> nce T reinterpret(float64x1_t a);
template <typename T> nce T reinterpret(uint8x8_t a);
template <typename T> nce T reinterpret(uint16x4_t a);
template <typename T> nce T reinterpret(uint32x2_t a);
template <typename T> nce T reinterpret(poly16x4_t a);
template <typename T> nce T reinterpret(uint64x1_t a);
template <typename T> nce T reinterpret(int64x1_t a);
template <typename T> nce T reinterpret(float16x4_t a);
template <typename T> nce T reinterpret(int8x16_t a);
template <typename T> nce T reinterpret(int16x8_t a);
template <typename T> nce T reinterpret(int32x4_t a);
template <typename T> nce T reinterpret(float32x4_t a);
template <typename T> nce T reinterpret(float64x2_t a);
template <typename T> nce T reinterpret(uint8x16_t a);
template <typename T> nce T reinterpret(uint16x8_t a);
template <typename T> nce T reinterpret(uint32x4_t a);
template <typename T> nce T reinterpret(poly16x8_t a);
template <typename T> nce T reinterpret(int64x2_t a);
template <typename T> nce T reinterpret(float16x8_t a);
template <typename T> nce T reinterpret(poly64x2_t a);
template <typename T> nce T reinterpret(poly128_t a);
template <typename T> nce T create(uint64_t a);
template <typename T> nce T duplicate(float64_t value);
template <typename T> nce T move(float64_t value);
template <typename T> nce T duplicate(float64x1_t vec);
template <typename T> nce T duplicate(int8x16_t vec);
template <typename T> nce T duplicate(int16x8_t vec);
template <typename T> nce T duplicate(int32x4_t vec);
template <typename T> nce T duplicate(int64x2_t vec);
template <typename T> nce T duplicate(uint8x16_t vec);
template <typename T> nce T duplicate(uint16x8_t vec);
template <typename T> nce T duplicate(uint32x4_t vec);
template <typename T> nce T duplicate(uint64x2_t vec);
template <typename T> nce T duplicate(poly64x2_t vec);
template <typename T> nce T duplicate(float32x4_t vec);
template <typename T> nce T duplicate(poly8x16_t vec);
template <typename T> nce T duplicate(poly16x8_t vec);
template <typename T> nce T duplicate(float64x2_t vec);
template <typename T> nce T get(float64x2_t a);
template <typename T> nce T load1(float64_t const *ptr);
template <typename T> nce T load1_duplicate(float64_t const *ptr);
template <typename T> nce T load2(float64_t const *ptr);
template <typename T> nce T load3(float64_t const *ptr);
template <typename T> nce T load3(int8_t const *ptr);
template <typename T> nce T load4(float64_t const *ptr);
template <typename T> nce T load2_duplicate(float64_t const *ptr);
template <typename T> nce T load3_duplicate(float64_t const *ptr);
template <typename T> nce T load4_duplicate(float64_t const *ptr);
template <typename T> nce T load1_x2(float64_t const *ptr);
template <typename T> nce T load1_x3(float64_t const *ptr);
template <typename T> nce T load1_x4(float64_t const *ptr);
template <typename T> nce T store1(float64_t *ptr, float64x1_t val);
template <typename T> nce T store1(float64_t *ptr, float64x2_t val);
template <typename T> nce T store2(int64_t *ptr, int64x2x2_t val);
template <typename T> nce T store2(uint64_t *ptr, uint64x2x2_t val);
template <typename T> nce T store2(poly64_t *ptr, poly64x2x2_t val);
template <typename T> nce T store2(float64_t *ptr, float64x1x2_t val);
template <typename T> nce T store2(float64_t *ptr, float64x2x2_t val);
template <typename T> nce T store3(int64_t *ptr, int64x2x3_t val);
template <typename T> nce T store3(uint64_t *ptr, uint64x2x3_t val);
template <typename T> nce T store3(poly64_t *ptr, poly64x2x3_t val);
template <typename T> nce T store3(float64_t *ptr, float64x1x3_t val);
template <typename T> nce T store3(float64_t *ptr, float64x2x3_t val);
template <typename T> nce T store4(int64_t *ptr, int64x2x4_t val);
template <typename T> nce T store4(uint64_t *ptr, uint64x2x4_t val);
template <typename T> nce T store4(poly64_t *ptr, poly64x2x4_t val);
template <typename T> nce T store4(float64_t *ptr, float64x1x4_t val);
template <typename T> nce T store4(float64_t *ptr, float64x2x4_t val);
template <typename T> nce T convert(int16_t a);
template <typename T> nce T convert(uint16_t a);
template <typename T> nce T convert(float16_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_away_from_zero(float16_t a);
template <typename T> nce T convert_round_toward_negative_infinity(float16_t a);
template <typename T> nce T convert_round_to_nearest_with_ties_to_even(float16_t a);
template <typename T> nce T convert_round_toward_positive_infinity(float16_t a);
template <typename T> nce T duplicate(float16x8_t vec);
template <typename T> nce T reinterpret(bfloat16x4_t a);
template <typename T> nce T reinterpret(bfloat16x8_t a);
template <typename T> nce T get_high(float64x2_t a);
template <typename T> nce T get_low(float64x2_t a);

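// Usage sketch (illustrative only; variable names are hypothetical): the templates
// above are only declared here and resolved by the destination type passed as the
// template argument, with the specializations appearing further down in this file, e.g.
//   uint8x8_t   bytes = vdup_n_u8(0x3f);
//   float64x1_t d     = neon::reinterpret<float64x1_t>(bytes);        // wraps vreinterpret_f64_u8
//   int64x2_t   n     = neon::convert<int64x2_t>(vdupq_n_f64(1.5));   // assuming a matching convert specialization
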
[[gnu::always_inline]] nce uint8x8_t add_saturate(uint8x8_t a, int8x8_t b) { return vsqadd_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t add_narrow_high_high(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vaddhn_high_u16(r, a, b); }
[[gnu::always_inline]] nce uint8x16_t add_round_narrow_high_high(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vraddhn_high_u16(r, a, b); }
[[gnu::always_inline]] nce uint8x16_t subtract_narrow_high_high(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vsubhn_high_u16(r, a, b); }
[[gnu::always_inline]] nce uint8x16_t subtract_round_narrow_high_high(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vrsubhn_high_u16(r, a, b); }
[[gnu::always_inline]] nce uint8_t reduce_add(uint8x8_t a) { return vaddv_u8(a); }
[[gnu::always_inline]] nce uint16_t reduce_add_long(uint8x8_t a) { return vaddlv_u8(a); }
[[gnu::always_inline]] inline uint8_t reduce_max(uint8x8_t a) { return vmaxv_u8(a); }
[[gnu::always_inline]] inline uint8_t reduce_min(uint8x8_t a) { return vminv_u8(a); }
[[gnu::always_inline]] nce uint8x8_t equal_to_zero(uint8x8_t a) { return vceqz_u8(a); }
template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_narrow_high(uint8x8_t r, uint16x8_t a) { return vshrn_high_n_u16(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_saturate_narrow_high(uint8x8_t r, uint16x8_t a) { return vqshrn_high_n_u16(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_round_saturate_narrow_high(uint8x8_t r, uint16x8_t a) { return vqrshrn_high_n_u16(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_round_narrow_high(uint8x8_t r, uint16x8_t a) { return vrshrn_high_n_u16(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_saturate_narrow_unsigned_high(uint8x8_t r, int16x8_t a) { return vqshrun_high_n_s16(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint8x16_t shift_right_unsigned_saturate_narrow_high(uint8x8_t r, int16x8_t a) { return vqrshrun_high_n_s16(r, a, n); }
template <> [[gnu::always_inline]] nce float64x1_t reinterpret(uint8x8_t a) { return vreinterpret_f64_u8(a); }
[[gnu::always_inline]] nce uint8x8_t zip2(uint8x8_t a, uint8x8_t b) { return vzip2_u8(a, b); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint8x8_t copy_lane(uint8x8_t a, uint8x8_t b) { return vcopy_lane_u8(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint8x8_t copy_lane(uint8x8_t a, uint8x16_t b) { return vcopy_laneq_u8(a, lane1, b, lane2); }
[[gnu::always_inline]] nce uint8x8_t reverse_bits(uint8x8_t a) { return vrbit_u8(a); }
[[gnu::always_inline]] nce uint8x8_t zip1(uint8x8_t a, uint8x8_t b) { return vzip1_u8(a, b); }
[[gnu::always_inline]] nce uint8x8_t unzip1(uint8x8_t a, uint8x8_t b) { return vuzp1_u8(a, b); }
[[gnu::always_inline]] nce uint8x8_t unzip2(uint8x8_t a, uint8x8_t b) { return vuzp2_u8(a, b); }
[[gnu::always_inline]] nce uint8x8_t transpose_step_1(uint8x8_t a, uint8x8_t b) { return vtrn1_u8(a, b); }
[[gnu::always_inline]] nce uint8x8_t transpose_step_2(uint8x8_t a, uint8x8_t b) { return vtrn2_u8(a, b); }
[[gnu::always_inline]] nce uint8x8_t table_extend1_saturate(uint8x8_t a, uint8x16_t t, uint8x8_t idx) { return vqtbx1_u8(a, t, idx); }
[[gnu::always_inline]] nce uint8x16_t move_saturate_narrow_high(uint8x8_t r, uint16x8_t a) { return vqmovn_high_u16(r, a); }
[[gnu::always_inline]] nce uint8x16_t move_unsigned_saturate_narrow_high(uint8x8_t r, int16x8_t a) { return vqmovun_high_s16(r, a); }
[[gnu::always_inline]] nce uint8x8_t table_extend2_saturate(uint8x8_t a, uint8x16x2_t t, uint8x8_t idx) { return vqtbx2_u8(a, t, idx); }
[[gnu::always_inline]] nce uint8x8_t table_extend3_saturate(uint8x8_t a, uint8x16x3_t t, uint8x8_t idx) { return vqtbx3_u8(a, t, idx); }
[[gnu::always_inline]] nce uint8x8_t table_extend4_saturate(uint8x8_t a, uint8x16x4_t t, uint8x8_t idx) { return vqtbx4_u8(a, t, idx); }
[[gnu::always_inline]] nce uint16x8_t add_long_high(uint8x16_t a, uint8x16_t b) { return vaddl_high_u8(a, b); }
[[gnu::always_inline]] nce uint16x8_t multiply_long_high(uint8x16_t a, uint8x16_t b) { return vmull_high_u8(a, b); }
[[gnu::always_inline]] nce uint16x8_t subtract_long_high(uint8x16_t a, uint8x16_t b) { return vsubl_high_u8(a, b); }
[[gnu::always_inline]] nce uint16x8_t subtract_absolute_long_high(uint8x16_t a, uint8x16_t b) { return vabdl_high_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t pairwise_add(uint8x16_t a, uint8x16_t b) { return vpaddq_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t pairwise_max(uint8x16_t a, uint8x16_t b) { return vpmaxq_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t pairwise_min(uint8x16_t a, uint8x16_t b) { return vpminq_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t add_saturate(uint8x16_t a, int8x16_t b) { return vsqaddq_u8(a, b); }
[[gnu::always_inline]] nce uint8_t reduce_add(uint8x16_t a) { return vaddvq_u8(a); }
[[gnu::always_inline]] nce uint16_t reduce_add_long(uint8x16_t a) { return vaddlvq_u8(a); }
[[gnu::always_inline]] inline uint8_t reduce_max(uint8x16_t a) { return vmaxvq_u8(a); }
[[gnu::always_inline]] inline uint8_t reduce_min(uint8x16_t a) { return vminvq_u8(a); }
[[gnu::always_inline]] nce uint8x16_t equal_to_zero(uint8x16_t a) { return vceqzq_u8(a); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_left_long_high(uint8x16_t a) { return vshll_high_n_u8(a, n); }
template <> [[gnu::always_inline]] nce float64x2_t reinterpret(uint8x16_t a) { return vreinterpretq_f64_u8(a); }
[[gnu::always_inline]] nce uint16x8_t move_long_high(uint8x16_t a) { return vmovl_high_u8(a); }
[[gnu::always_inline]] nce uint8x16_t reverse_bits(uint8x16_t a) { return vrbitq_u8(a); }
template <int lane>[[gnu::always_inline]] nce uint8x8_t duplicate_lane(uint8x16_t vec) { return vdup_laneq_u8(vec, lane); }
template <int lane>[[gnu::always_inline]] nce uint8x16_t duplicate_lane(uint8x16_t vec) { return vdupq_laneq_u8(vec, lane); }
[[gnu::always_inline]] nce uint8x8_t table_lookup1_saturate(uint8x16_t t, uint8x8_t idx) { return vqtbl1_u8(t, idx); }
[[gnu::always_inline]] nce uint8x16_t zip1(uint8x16_t a, uint8x16_t b) { return vzip1q_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t zip2(uint8x16_t a, uint8x16_t b) { return vzip2q_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t unzip1(uint8x16_t a, uint8x16_t b) { return vuzp1q_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t unzip2(uint8x16_t a, uint8x16_t b) { return vuzp2q_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t transpose_step_1(uint8x16_t a, uint8x16_t b) { return vtrn1q_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t transpose_step_2(uint8x16_t a, uint8x16_t b) { return vtrn2q_u8(a, b); }
[[gnu::always_inline]] nce uint8x16_t table_lookup1_saturate(uint8x16_t t, uint8x16_t idx) { return vqtbl1q_u8(t, idx); }
[[gnu::always_inline]] nce uint8x16_t table_extend1_saturate(uint8x16_t a, uint8x16_t t, uint8x16_t idx) { return vqtbx1q_u8(a, t, idx); }

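// Usage sketch (illustrative only; lo, a_lo/b_lo and a_hi/b_hi are hypothetical values):
// the *_narrow_high_high helpers pack a narrowed result on top of an existing 64-bit
// low half, e.g.
//   uint8x8_t  lo = vaddhn_u16(a_lo, b_lo);                       // low half via the plain intrinsic
//   uint8x16_t r  = neon::add_narrow_high_high(lo, a_hi, b_hi);   // wraps vaddhn_high_u16
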
template <int lane1, int lane2>[[gnu::always_inline]] nce uint8x16_t copy_lane(uint8x16_t a, uint8x8_t b) { return vcopyq_lane_u8(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint8x16_t copy_lane(uint8x16_t a, uint8x16_t b) { return vcopyq_laneq_u8(a, lane1, b, lane2); }
[[gnu::always_inline]] nce uint8x16_t table_extend2_saturate(uint8x16_t a, uint8x16x2_t t, uint8x16_t idx) { return vqtbx2q_u8(a, t, idx); }
[[gnu::always_inline]] nce uint8x16_t table_extend3_saturate(uint8x16_t a, uint8x16x3_t t, uint8x16_t idx) { return vqtbx3q_u8(a, t, idx); }
[[gnu::always_inline]] nce uint8x16_t table_extend4_saturate(uint8x16_t a, uint8x16x4_t t, uint8x16_t idx) { return vqtbx4q_u8(a, t, idx); }
[[gnu::always_inline]] nce int8x8_t add_saturate(int8x8_t a, uint8x8_t b) { return vuqadd_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t add_narrow_high_high(int8x8_t r, int16x8_t a, int16x8_t b) { return vaddhn_high_s16(r, a, b); }
[[gnu::always_inline]] nce int8x16_t add_round_narrow_high_high(int8x8_t r, int16x8_t a, int16x8_t b) { return vraddhn_high_s16(r, a, b); }
[[gnu::always_inline]] nce int8x16_t subtract_narrow_high_high(int8x8_t r, int16x8_t a, int16x8_t b) { return vsubhn_high_s16(r, a, b); }
[[gnu::always_inline]] nce int8x16_t subtract_round_narrow_high_high(int8x8_t r, int16x8_t a, int16x8_t b) { return vrsubhn_high_s16(r, a, b); }
[[gnu::always_inline]] nce int8_t reduce_add(int8x8_t a) { return vaddv_s8(a); }
[[gnu::always_inline]] nce int16_t reduce_add_long(int8x8_t a) { return vaddlv_s8(a); }
[[gnu::always_inline]] inline int8_t reduce_max(int8x8_t a) { return vmaxv_s8(a); }
[[gnu::always_inline]] inline int8_t reduce_min(int8x8_t a) { return vminv_s8(a); }
[[gnu::always_inline]] nce uint8x8_t equal_to_zero(int8x8_t a) { return vceqz_s8(a); }
[[gnu::always_inline]] nce uint8x8_t greater_than_or_equal_to_zero(int8x8_t a) { return vcgez_s8(a); }
[[gnu::always_inline]] nce uint8x8_t less_than_or_equal_to_zero(int8x8_t a) { return vclez_s8(a); }
[[gnu::always_inline]] nce uint8x8_t greater_than_zero(int8x8_t a) { return vcgtz_s8(a); }
[[gnu::always_inline]] nce uint8x8_t less_than_zero(int8x8_t a) { return vcltz_s8(a); }
template <int n>[[gnu::always_inline]] nce int8x16_t shift_right_narrow_high(int8x8_t r, int16x8_t a) { return vshrn_high_n_s16(r, a, n); }
template <int n>[[gnu::always_inline]] nce int8x16_t shift_right_saturate_narrow_high(int8x8_t r, int16x8_t a) { return vqshrn_high_n_s16(r, a, n); }
template <int n>[[gnu::always_inline]] nce int8x16_t shift_right_round_saturate_narrow_high(int8x8_t r, int16x8_t a) { return vqrshrn_high_n_s16(r, a, n); }
template <int n>[[gnu::always_inline]] nce int8x16_t shift_right_round_narrow_high(int8x8_t r, int16x8_t a) { return vrshrn_high_n_s16(r, a, n); }
template <> [[gnu::always_inline]] nce float64x1_t reinterpret(int8x8_t a) { return vreinterpret_f64_s8(a); }
[[gnu::always_inline]] nce int8x8_t reverse_bits(int8x8_t a) { return vrbit_s8(a); }
[[gnu::always_inline]] nce int8x8_t zip1(int8x8_t a, int8x8_t b) { return vzip1_s8(a, b); }
[[gnu::always_inline]] nce int8x8_t zip2(int8x8_t a, int8x8_t b) { return vzip2_s8(a, b); }
[[gnu::always_inline]] nce int8x8_t unzip1(int8x8_t a, int8x8_t b) { return vuzp1_s8(a, b); }
[[gnu::always_inline]] nce int8x8_t unzip2(int8x8_t a, int8x8_t b) { return vuzp2_s8(a, b); }
[[gnu::always_inline]] nce int8x8_t transpose_step_1(int8x8_t a, int8x8_t b) { return vtrn1_s8(a, b); }
[[gnu::always_inline]] nce int8x8_t transpose_step_2(int8x8_t a, int8x8_t b) { return vtrn2_s8(a, b); }
[[gnu::always_inline]] nce int8x8_t table_extend1_saturate(int8x8_t a, int8x16_t t, uint8x8_t idx) { return vqtbx1_s8(a, t, idx); }
[[gnu::always_inline]] nce int8x16_t move_saturate_narrow_high(int8x8_t r, int16x8_t a) { return vqmovn_high_s16(r, a); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int8x8_t copy_lane(int8x8_t a, int8x8_t b) { return vcopy_lane_s8(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int8x8_t copy_lane(int8x8_t a, int8x16_t b) { return vcopy_laneq_s8(a, lane1, b, lane2); }
[[gnu::always_inline]] nce int8x8_t table_extend2_saturate(int8x8_t a, int8x16x2_t t, uint8x8_t idx) { return vqtbx2_s8(a, t, idx); }
[[gnu::always_inline]] nce int8x8_t table_extend3_saturate(int8x8_t a, int8x16x3_t t, uint8x8_t idx) { return vqtbx3_s8(a, t, idx); }
[[gnu::always_inline]] nce int8x8_t table_extend4_saturate(int8x8_t a, int8x16x4_t t, uint8x8_t idx) { return vqtbx4_s8(a, t, idx); }
[[gnu::always_inline]] nce int8x16_t add_saturate(int8x16_t a, uint8x16_t b) { return vuqaddq_s8(a, b); }
[[gnu::always_inline]] nce int16x8_t add_long_high(int8x16_t a, int8x16_t b) { return vaddl_high_s8(a, b); }
[[gnu::always_inline]] nce int16x8_t multiply_long_high(int8x16_t a, int8x16_t b) { return vmull_high_s8(a, b); }
[[gnu::always_inline]] nce int16x8_t subtract_long_high(int8x16_t a, int8x16_t b) { return vsubl_high_s8(a, b); }
[[gnu::always_inline]] nce int16x8_t subtract_absolute_long_high(int8x16_t a, int8x16_t b) { return vabdl_high_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t pairwise_add(int8x16_t a, int8x16_t b) { return vpaddq_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t pairwise_max(int8x16_t a, int8x16_t b) { return vpmaxq_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t pairwise_min(int8x16_t a, int8x16_t b) { return vpminq_s8(a, b); }
[[gnu::always_inline]] nce int8_t reduce_add(int8x16_t a) { return vaddvq_s8(a); }
[[gnu::always_inline]] nce int16_t reduce_add_long(int8x16_t a) { return vaddlvq_s8(a); }
[[gnu::always_inline]] inline int8_t reduce_max(int8x16_t a) { return vmaxvq_s8(a); }
[[gnu::always_inline]] inline int8_t reduce_min(int8x16_t a) { return vminvq_s8(a); }
[[gnu::always_inline]] nce uint8x16_t equal_to_zero(int8x16_t a) { return vceqzq_s8(a); }
[[gnu::always_inline]] nce uint8x16_t greater_than_or_equal_to_zero(int8x16_t a) { return vcgezq_s8(a); }
[[gnu::always_inline]] nce uint8x16_t less_than_or_equal_to_zero(int8x16_t a) { return vclezq_s8(a); }
[[gnu::always_inline]] nce uint8x16_t greater_than_zero(int8x16_t a) { return vcgtzq_s8(a); }
[[gnu::always_inline]] nce uint8x16_t less_than_zero(int8x16_t a) { return vcltzq_s8(a); }
template <int n>[[gnu::always_inline]] nce int16x8_t shift_left_long_high(int8x16_t a) { return vshll_high_n_s8(a, n); }
template <> [[gnu::always_inline]] nce float64x2_t reinterpret(int8x16_t a) { return vreinterpretq_f64_s8(a); }
[[gnu::always_inline]] nce int16x8_t move_long_high(int8x16_t a) { return vmovl_high_s8(a); }
[[gnu::always_inline]] nce int8x16_t reverse_bits(int8x16_t a) { return vrbitq_s8(a); }
template <int lane>[[gnu::always_inline]] nce int8x8_t duplicate_lane(int8x16_t vec) { return vdup_laneq_s8(vec, lane); }
template <int lane>[[gnu::always_inline]] nce int8x16_t duplicate_lane(int8x16_t vec) { return vdupq_laneq_s8(vec, lane); }
[[gnu::always_inline]] nce int8x8_t table_lookup1_saturate(int8x16_t t, uint8x8_t idx) { return vqtbl1_s8(t, idx); }
[[gnu::always_inline]] nce int8x16_t table_lookup1_saturate(int8x16_t t, uint8x16_t idx) { return vqtbl1q_s8(t, idx); }
[[gnu::always_inline]] nce int8x16_t zip1(int8x16_t a, int8x16_t b) { return vzip1q_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t zip2(int8x16_t a, int8x16_t b) { return vzip2q_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t unzip1(int8x16_t a, int8x16_t b) { return vuzp1q_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t unzip2(int8x16_t a, int8x16_t b) { return vuzp2q_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t transpose_step_1(int8x16_t a, int8x16_t b) { return vtrn1q_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t transpose_step_2(int8x16_t a, int8x16_t b) { return vtrn2q_s8(a, b); }
[[gnu::always_inline]] nce int8x16_t table_extend1_saturate(int8x16_t a, int8x16_t t, uint8x16_t idx) { return vqtbx1q_s8(a, t, idx); }

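// Usage sketch (illustrative only; fallback, table and indices are hypothetical values):
// the table_extend* helpers wrap TBX-style lookups, which leave a lane of the first
// argument untouched when its index is out of range, e.g.
//   int8x16_t out = neon::table_extend1_saturate(fallback, table, indices);   // wraps vqtbx1q_s8
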
template <int lane1, int lane2>[[gnu::always_inline]] nce int8x16_t copy_lane(int8x16_t a, int8x8_t b) { return vcopyq_lane_s8(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int8x16_t copy_lane(int8x16_t a, int8x16_t b) { return vcopyq_laneq_s8(a, lane1, b, lane2); }
[[gnu::always_inline]] nce int8x16_t table_extend2_saturate(int8x16_t a, int8x16x2_t t, uint8x16_t idx) { return vqtbx2q_s8(a, t, idx); }
[[gnu::always_inline]] nce int8x16_t table_extend3_saturate(int8x16_t a, int8x16x3_t t, uint8x16_t idx) { return vqtbx3q_s8(a, t, idx); }
[[gnu::always_inline]] nce int8x16_t table_extend4_saturate(int8x16_t a, int8x16x4_t t, uint8x16_t idx) { return vqtbx4q_s8(a, t, idx); }
[[gnu::always_inline]] nce uint16x4_t add_saturate(uint16x4_t a, int16x4_t b) { return vsqadd_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t add_narrow_high_high(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vaddhn_high_u32(r, a, b); }
[[gnu::always_inline]] nce uint16x8_t add_round_narrow_high_high(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vraddhn_high_u32(r, a, b); }
[[gnu::always_inline]] nce uint16x8_t subtract_narrow_high_high(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vsubhn_high_u32(r, a, b); }
[[gnu::always_inline]] nce uint16x8_t subtract_round_narrow_high_high(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vrsubhn_high_u32(r, a, b); }
[[gnu::always_inline]] nce uint16_t reduce_add(uint16x4_t a) { return vaddv_u16(a); }
[[gnu::always_inline]] nce uint32_t reduce_add_long(uint16x4_t a) { return vaddlv_u16(a); }
[[gnu::always_inline]] inline uint16_t reduce_max(uint16x4_t a) { return vmaxv_u16(a); }
[[gnu::always_inline]] inline uint16_t reduce_min(uint16x4_t a) { return vminv_u16(a); }
[[gnu::always_inline]] nce uint16x4_t equal_to_zero(uint16x4_t a) { return vceqz_u16(a); }
template <> [[gnu::always_inline]] nce float64x1_t reinterpret(uint16x4_t a) { return vreinterpret_f64_u16(a); }
template <int lane>[[gnu::always_inline]] nce uint16x4_t multiply_add_lane(uint16x4_t a, uint16x4_t b, uint16x8_t v) { return vmla_laneq_u16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint16x4_t multiply_subtract_lane(uint16x4_t a, uint16x4_t b, uint16x8_t v) { return vmls_laneq_u16(a, b, v, lane); }
[[gnu::always_inline]] nce uint16x4_t zip1(uint16x4_t a, uint16x4_t b) { return vzip1_u16(a, b); }
[[gnu::always_inline]] nce uint16x4_t zip2(uint16x4_t a, uint16x4_t b) { return vzip2_u16(a, b); }
[[gnu::always_inline]] nce uint16x4_t unzip1(uint16x4_t a, uint16x4_t b) { return vuzp1_u16(a, b); }
[[gnu::always_inline]] nce uint16x4_t unzip2(uint16x4_t a, uint16x4_t b) { return vuzp2_u16(a, b); }
[[gnu::always_inline]] nce uint16x4_t transpose_step_1(uint16x4_t a, uint16x4_t b) { return vtrn1_u16(a, b); }
[[gnu::always_inline]] nce uint16x4_t transpose_step_2(uint16x4_t a, uint16x4_t b) { return vtrn2_u16(a, b); }
template <int lane>[[gnu::always_inline]] nce uint16x4_t multiply_lane(uint16x4_t a, uint16x8_t v) { return vmul_laneq_u16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_long_lane(uint16x4_t a, uint16x8_t v) { return vmull_laneq_u16(a, v, lane); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_saturate_narrow_unsigned_high(uint16x4_t r, int32x4_t a) { return vqshrun_high_n_s32(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_unsigned_saturate_narrow_high(uint16x4_t r, int32x4_t a) { return vqrshrun_high_n_s32(r, a, n); }
[[gnu::always_inline]] nce uint16x8_t move_unsigned_saturate_narrow_high(uint16x4_t r, int32x4_t a) { return vqmovun_high_s32(r, a); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_narrow_high(uint16x4_t r, uint32x4_t a) { return vshrn_high_n_u32(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_saturate_narrow_high(uint16x4_t r, uint32x4_t a) { return vqshrn_high_n_u32(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_round_saturate_narrow_high(uint16x4_t r, uint32x4_t a) { return vqrshrn_high_n_u32(r, a, n); }
template <int n>[[gnu::always_inline]] nce uint16x8_t shift_right_round_narrow_high(uint16x4_t r, uint32x4_t a) { return vrshrn_high_n_u32(r, a, n); }
[[gnu::always_inline]] nce uint16x8_t move_saturate_narrow_high(uint16x4_t r, uint32x4_t a) { return vqmovn_high_u32(r, a); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint16x4_t copy_lane(uint16x4_t a, uint16x4_t b) { return vcopy_lane_u16(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint16x4_t copy_lane(uint16x4_t a, uint16x8_t b) { return vcopy_laneq_u16(a, lane1, b, lane2); }
[[gnu::always_inline]] nce uint16x8_t add_high(uint16x8_t a, uint8x16_t b) { return vaddw_high_u8(a, b); }
[[gnu::always_inline]] nce uint16x8_t multiply_add_long_high(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vmlal_high_u8(a, b, c); }
[[gnu::always_inline]] nce uint16x8_t multiply_subtract_long_high(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vmlsl_high_u8(a, b, c); }
[[gnu::always_inline]] nce uint16x8_t subtract_high(uint16x8_t a, uint8x16_t b) { return vsubw_high_u8(a, b); }
[[gnu::always_inline]] nce uint16x8_t subtract_absolute_add_high(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vabal_high_u8(a, b, c); }
[[gnu::always_inline]] nce uint32x4_t add_long_high(uint16x8_t a, uint16x8_t b) { return vaddl_high_u16(a, b); }
[[gnu::always_inline]] nce uint32x4_t multiply_long_high(uint16x8_t a, uint16x8_t b) { return vmull_high_u16(a, b); }
[[gnu::always_inline]] nce uint32x4_t subtract_long_high(uint16x8_t a, uint16x8_t b) { return vsubl_high_u16(a, b); }
[[gnu::always_inline]] nce uint32x4_t subtract_absolute_long_high(uint16x8_t a, uint16x8_t b) { return vabdl_high_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t pairwise_add(uint16x8_t a, uint16x8_t b) { return vpaddq_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t pairwise_max(uint16x8_t a, uint16x8_t b) { return vpmaxq_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t pairwise_min(uint16x8_t a, uint16x8_t b) { return vpminq_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t add_saturate(uint16x8_t a, int16x8_t b) { return vsqaddq_u16(a, b); }
[[gnu::always_inline]] nce uint16_t reduce_add(uint16x8_t a) { return vaddvq_u16(a); }
[[gnu::always_inline]] nce uint32_t reduce_add_long(uint16x8_t a) { return vaddlvq_u16(a); }
[[gnu::always_inline]] inline uint16_t reduce_max(uint16x8_t a) { return vmaxvq_u16(a); }
[[gnu::always_inline]] inline uint16_t reduce_min(uint16x8_t a) { return vminvq_u16(a); }
[[gnu::always_inline]] nce uint16x8_t equal_to_zero(uint16x8_t a) { return vceqzq_u16(a); }
template <int n>[[gnu::always_inline]] nce uint32x4_t shift_left_long_high(uint16x8_t a) { return vshll_high_n_u16(a, n); }
template <> [[gnu::always_inline]] nce float64x2_t reinterpret(uint16x8_t a) { return vreinterpretq_f64_u16(a); }
[[gnu::always_inline]] nce uint32x4_t move_long_high(uint16x8_t a) { return vmovl_high_u16(a); }
template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_long_lane_high(uint16x8_t a, uint16x4_t v) { return vmull_high_lane_u16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint16x8_t multiply_add_lane(uint16x8_t a, uint16x8_t b, uint16x8_t v) { return vmlaq_laneq_u16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint16x8_t multiply_subtract_lane(uint16x8_t a, uint16x8_t b, uint16x8_t v) { return vmlsq_laneq_u16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint16x8_t multiply_lane(uint16x8_t a, uint16x8_t v) { return vmulq_laneq_u16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_long_lane_high(uint16x8_t a, uint16x8_t v) { return vmull_high_laneq_u16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce uint16x4_t duplicate_lane(uint16x8_t vec) { return vdup_laneq_u16(vec, lane); }
template <int lane>[[gnu::always_inline]] nce uint16x8_t duplicate_lane(uint16x8_t vec) { return vdupq_laneq_u16(vec, lane); }
[[gnu::always_inline]] nce uint16x8_t zip1(uint16x8_t a, uint16x8_t b) { return vzip1q_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t zip2(uint16x8_t a, uint16x8_t b) { return vzip2q_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t unzip1(uint16x8_t a, uint16x8_t b) { return vuzp1q_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t unzip2(uint16x8_t a, uint16x8_t b) { return vuzp2q_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t transpose_step_1(uint16x8_t a, uint16x8_t b) { return vtrn1q_u16(a, b); }
[[gnu::always_inline]] nce uint16x8_t transpose_step_2(uint16x8_t a, uint16x8_t b) { return vtrn2q_u16(a, b); }

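// Usage sketch (illustrative only): the reduce_* helpers fold a whole vector into a
// single scalar, with reduce_add_long widening the accumulator, e.g.
//   uint16x8_t v   = vdupq_n_u16(3);
//   uint16_t   sum = neon::reduce_add(v);        // wraps vaddvq_u16 -> 24
//   uint32_t   big = neon::reduce_add_long(v);   // wraps vaddlvq_u16
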
[[gnu::always_inline]] nce uint32x4_t multiply_long_high(uint16x8_t a, uint16_t b) { return vmull_high_n_u16(a, b); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint16x8_t copy_lane(uint16x8_t a, uint16x4_t b) { return vcopyq_lane_u16(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce uint16x8_t copy_lane(uint16x8_t a, uint16x8_t b) { return vcopyq_laneq_u16(a, lane1, b, lane2); }
[[gnu::always_inline]] nce int16x4_t add_saturate(int16x4_t a, uint16x4_t b) { return vuqadd_s16(a, b); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long_lane(int16x4_t a, int16x8_t v) { return vqdmull_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_double_saturate_high_lane(int16x4_t a, int16x8_t v) { return vqdmulh_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_double_round_saturate_high_lane(int16x4_t a, int16x8_t v) { return vqrdmulh_laneq_s16(a, v, lane); }
[[gnu::always_inline]] nce int16x8_t add_narrow_high_high(int16x4_t r, int32x4_t a, int32x4_t b) { return vaddhn_high_s32(r, a, b); }
[[gnu::always_inline]] nce int16x8_t add_round_narrow_high_high(int16x4_t r, int32x4_t a, int32x4_t b) { return vraddhn_high_s32(r, a, b); }
[[gnu::always_inline]] nce int16x8_t subtract_narrow_high_high(int16x4_t r, int32x4_t a, int32x4_t b) { return vsubhn_high_s32(r, a, b); }
[[gnu::always_inline]] nce int16x8_t subtract_round_narrow_high_high(int16x4_t r, int32x4_t a, int32x4_t b) { return vrsubhn_high_s32(r, a, b); }
[[gnu::always_inline]] nce int16_t reduce_add(int16x4_t a) { return vaddv_s16(a); }
[[gnu::always_inline]] nce int32_t reduce_add_long(int16x4_t a) { return vaddlv_s16(a); }
[[gnu::always_inline]] inline int16_t reduce_max(int16x4_t a) { return vmaxv_s16(a); }
[[gnu::always_inline]] inline int16_t reduce_min(int16x4_t a) { return vminv_s16(a); }
[[gnu::always_inline]] nce uint16x4_t equal_to_zero(int16x4_t a) { return vceqz_s16(a); }
[[gnu::always_inline]] nce uint16x4_t greater_than_or_equal_to_zero(int16x4_t a) { return vcgez_s16(a); }
[[gnu::always_inline]] nce uint16x4_t less_than_or_equal_to_zero(int16x4_t a) { return vclez_s16(a); }
[[gnu::always_inline]] nce uint16x4_t greater_than_zero(int16x4_t a) { return vcgtz_s16(a); }
[[gnu::always_inline]] nce uint16x4_t less_than_zero(int16x4_t a) { return vcltz_s16(a); }
template <> [[gnu::always_inline]] nce float64x1_t reinterpret(int16x4_t a) { return vreinterpret_f64_s16(a); }
[[gnu::always_inline]] nce int16x4_t zip1(int16x4_t a, int16x4_t b) { return vzip1_s16(a, b); }
[[gnu::always_inline]] nce int16x4_t zip2(int16x4_t a, int16x4_t b) { return vzip2_s16(a, b); }
[[gnu::always_inline]] nce int16x4_t unzip1(int16x4_t a, int16x4_t b) { return vuzp1_s16(a, b); }
[[gnu::always_inline]] nce int16x4_t unzip2(int16x4_t a, int16x4_t b) { return vuzp2_s16(a, b); }
[[gnu::always_inline]] nce int16x4_t transpose_step_1(int16x4_t a, int16x4_t b) { return vtrn1_s16(a, b); }
[[gnu::always_inline]] nce int16x4_t transpose_step_2(int16x4_t a, int16x4_t b) { return vtrn2_s16(a, b); }
[[gnu::always_inline]] nce int16x4_t multiply_double_add_round_saturate_high(int16x4_t a, int16x4_t b, int16x4_t c) { return vqrdmlah_s16(a, b, c); }
[[gnu::always_inline]] nce int16x4_t multiply_double_subtract_round_saturate_high(int16x4_t a, int16x4_t b, int16x4_t c) { return vqrdmlsh_s16(a, b, c); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_double_add_round_saturate_high_lane(int16x4_t a, int16x4_t b, int16x4_t v) { return vqrdmlah_lane_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_double_subtract_round_saturate_high_lane(int16x4_t a, int16x4_t b, int16x4_t v) { return vqrdmlsh_lane_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_add_lane(int16x4_t a, int16x4_t b, int16x8_t v) { return vmla_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_subtract_lane(int16x4_t a, int16x4_t b, int16x8_t v) { return vmls_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_double_add_round_saturate_high_lane(int16x4_t a, int16x4_t b, int16x8_t v) { return vqrdmlah_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_double_subtract_round_saturate_high_lane(int16x4_t a, int16x4_t b, int16x8_t v) { return vqrdmlsh_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t multiply_lane(int16x4_t a, int16x8_t v) { return vmul_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_long_lane(int16x4_t a, int16x8_t v) { return vmull_laneq_s16(a, v, lane); }
template <int n>[[gnu::always_inline]] nce int16x8_t shift_right_narrow_high(int16x4_t r, int32x4_t a) { return vshrn_high_n_s32(r, a, n); }
template <int n>[[gnu::always_inline]] nce int16x8_t shift_right_saturate_narrow_high(int16x4_t r, int32x4_t a) { return vqshrn_high_n_s32(r, a, n); }
template <int n>[[gnu::always_inline]] nce int16x8_t shift_right_round_saturate_narrow_high(int16x4_t r, int32x4_t a) { return vqrshrn_high_n_s32(r, a, n); }
template <int n>[[gnu::always_inline]] nce int16x8_t shift_right_round_narrow_high(int16x4_t r, int32x4_t a) { return vrshrn_high_n_s32(r, a, n); }
[[gnu::always_inline]] nce int16x8_t move_saturate_narrow_high(int16x4_t r, int32x4_t a) { return vqmovn_high_s32(r, a); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int16x4_t copy_lane(int16x4_t a, int16x4_t b) { return vcopy_lane_s16(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int16x4_t copy_lane(int16x4_t a, int16x8_t b) { return vcopy_laneq_s16(a, lane1, b, lane2); }
[[gnu::always_inline]] nce int16x8_t add_high(int16x8_t a, int8x16_t b) { return vaddw_high_s8(a, b); }
[[gnu::always_inline]] nce int16x8_t multiply_add_long_high(int16x8_t a, int8x16_t b, int8x16_t c) { return vmlal_high_s8(a, b, c); }
[[gnu::always_inline]] nce int16x8_t multiply_subtract_long_high(int16x8_t a, int8x16_t b, int8x16_t c) { return vmlsl_high_s8(a, b, c); }
[[gnu::always_inline]] nce int16x8_t subtract_high(int16x8_t a, int8x16_t b) { return vsubw_high_s8(a, b); }
[[gnu::always_inline]] nce int16x8_t subtract_absolute_add_high(int16x8_t a, int8x16_t b, int8x16_t c) { return vabal_high_s8(a, b, c); }
[[gnu::always_inline]] nce int16x8_t add_saturate(int16x8_t a, uint16x8_t b) { return vuqaddq_s16(a, b); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long_lane_high(int16x8_t a, int16x4_t v) { return vqdmull_high_lane_s16(a, v, lane); }
[[gnu::always_inline]] nce int32x4_t add_long_high(int16x8_t a, int16x8_t b) { return vaddl_high_s16(a, b); }
[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long_high(int16x8_t a, int16x8_t b) { return vqdmull_high_s16(a, b); }
[[gnu::always_inline]] nce int32x4_t multiply_long_high(int16x8_t a, int16x8_t b) { return vmull_high_s16(a, b); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long_lane_high(int16x8_t a, int16x8_t v) { return vqdmull_high_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_double_saturate_high_lane(int16x8_t a, int16x8_t v) { return vqdmulhq_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_double_round_saturate_high_lane(int16x8_t a, int16x8_t v) { return vqrdmulhq_laneq_s16(a, v, lane); }
[[gnu::always_inline]] nce int32x4_t subtract_long_high(int16x8_t a, int16x8_t b) { return vsubl_high_s16(a, b); }
[[gnu::always_inline]] nce int32x4_t subtract_absolute_long_high(int16x8_t a, int16x8_t b) { return vabdl_high_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t pairwise_add(int16x8_t a, int16x8_t b) { return vpaddq_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t pairwise_max(int16x8_t a, int16x8_t b) { return vpmaxq_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t pairwise_min(int16x8_t a, int16x8_t b) { return vpminq_s16(a, b); }
[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_long_high(int16x8_t a, int16_t b) { return vqdmull_high_n_s16(a, b); }
[[gnu::always_inline]] nce int16_t reduce_add(int16x8_t a) { return vaddvq_s16(a); }
[[gnu::always_inline]] nce int32_t reduce_add_long(int16x8_t a) { return vaddlvq_s16(a); }
[[gnu::always_inline]] inline int16_t reduce_max(int16x8_t a) { return vmaxvq_s16(a); }
[[gnu::always_inline]] inline int16_t reduce_min(int16x8_t a) { return vminvq_s16(a); }
[[gnu::always_inline]] nce uint16x8_t equal_to_zero(int16x8_t a) { return vceqzq_s16(a); }
[[gnu::always_inline]] nce uint16x8_t greater_than_or_equal_to_zero(int16x8_t a) { return vcgezq_s16(a); }
[[gnu::always_inline]] nce uint16x8_t less_than_or_equal_to_zero(int16x8_t a) { return vclezq_s16(a); }
[[gnu::always_inline]] nce uint16x8_t greater_than_zero(int16x8_t a) { return vcgtzq_s16(a); }
[[gnu::always_inline]] nce uint16x8_t less_than_zero(int16x8_t a) { return vcltzq_s16(a); }
template <int n>[[gnu::always_inline]] nce int32x4_t shift_left_long_high(int16x8_t a) { return vshll_high_n_s16(a, n); }
template <> [[gnu::always_inline]] nce float64x2_t reinterpret(int16x8_t a) { return vreinterpretq_f64_s16(a); }
[[gnu::always_inline]] nce int32x4_t move_long_high(int16x8_t a) { return vmovl_high_s16(a); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_long_lane_high(int16x8_t a, int16x4_t v) { return vmull_high_lane_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_add_lane(int16x8_t a, int16x8_t b, int16x8_t v) { return vmlaq_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_subtract_lane(int16x8_t a, int16x8_t b, int16x8_t v) { return vmlsq_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_lane(int16x8_t a, int16x8_t v) { return vmulq_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_long_lane_high(int16x8_t a, int16x8_t v) { return vmull_high_laneq_s16(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x4_t duplicate_lane(int16x8_t vec) { return vdup_laneq_s16(vec, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t duplicate_lane(int16x8_t vec) { return vdupq_laneq_s16(vec, lane); }
[[gnu::always_inline]] nce int16x8_t zip1(int16x8_t a, int16x8_t b) { return vzip1q_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t zip2(int16x8_t a, int16x8_t b) { return vzip2q_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t unzip1(int16x8_t a, int16x8_t b) { return vuzp1q_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t unzip2(int16x8_t a, int16x8_t b) { return vuzp2q_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t transpose_step_1(int16x8_t a, int16x8_t b) { return vtrn1q_s16(a, b); }
[[gnu::always_inline]] nce int16x8_t transpose_step_2(int16x8_t a, int16x8_t b) { return vtrn2q_s16(a, b); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_double_add_round_saturate_high_lane(int16x8_t a, int16x8_t b, int16x4_t v) { return vqrdmlahq_lane_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_double_subtract_round_saturate_high_lane(int16x8_t a, int16x8_t b, int16x4_t v) { return vqrdmlshq_lane_s16(a, b, v, lane); }
[[gnu::always_inline]] nce int16x8_t multiply_double_add_round_saturate_high(int16x8_t a, int16x8_t b, int16x8_t c) { return vqrdmlahq_s16(a, b, c); }
[[gnu::always_inline]] nce int16x8_t multiply_double_subtract_round_saturate_high(int16x8_t a, int16x8_t b, int16x8_t c) { return vqrdmlshq_s16(a, b, c); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_double_add_round_saturate_high_lane(int16x8_t a, int16x8_t b, int16x8_t v) { return vqrdmlahq_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int16x8_t multiply_double_subtract_round_saturate_high_lane(int16x8_t a, int16x8_t b, int16x8_t v) { return vqrdmlshq_laneq_s16(a, b, v, lane); }

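// Usage sketch (illustrative only; a and b are hypothetical int16x8_t values):
// transpose_step_1/2 interleave the even- and odd-indexed lanes of two vectors, so
// applying both to the same operands gives a 2x2 lane transpose, e.g.
//   int16x8_t even = neon::transpose_step_1(a, b);   // wraps vtrn1q_s16
//   int16x8_t odd  = neon::transpose_step_2(a, b);   // wraps vtrn2q_s16
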
[[gnu::always_inline]] nce int32x4_t multiply_long_high(int16x8_t a, int16_t b) { return vmull_high_n_s16(a, b); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int16x8_t copy_lane(int16x8_t a, int16x4_t b) { return vcopyq_lane_s16(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int16x8_t copy_lane(int16x8_t a, int16x8_t b) { return vcopyq_laneq_s16(a, lane1, b, lane2); }
template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long_lane(int32x2_t a, int32x4_t v) { return vqdmull_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_double_saturate_high_lane(int32x2_t a, int32x4_t v) { return vqdmulh_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_double_round_saturate_high_lane(int32x2_t a, int32x4_t v) { return vqrdmulh_laneq_s32(a, v, lane); }
[[gnu::always_inline]] nce int32x2_t add_saturate(int32x2_t a, uint32x2_t b) { return vuqadd_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t add_narrow_high_high(int32x2_t r, int64x2_t a, int64x2_t b) { return vaddhn_high_s64(r, a, b); }
[[gnu::always_inline]] nce int32x4_t add_round_narrow_high_high(int32x2_t r, int64x2_t a, int64x2_t b) { return vraddhn_high_s64(r, a, b); }
[[gnu::always_inline]] nce int32x4_t subtract_narrow_high_high(int32x2_t r, int64x2_t a, int64x2_t b) { return vsubhn_high_s64(r, a, b); }
[[gnu::always_inline]] nce int32x4_t subtract_round_narrow_high_high(int32x2_t r, int64x2_t a, int64x2_t b) { return vrsubhn_high_s64(r, a, b); }
[[gnu::always_inline]] nce int32_t reduce_add(int32x2_t a) { return vaddv_s32(a); }
[[gnu::always_inline]] nce int64_t reduce_add_long(int32x2_t a) { return vaddlv_s32(a); }
[[gnu::always_inline]] inline int32_t reduce_max(int32x2_t a) { return vmaxv_s32(a); }
[[gnu::always_inline]] inline int32_t reduce_min(int32x2_t a) { return vminv_s32(a); }
[[gnu::always_inline]] nce uint32x2_t equal_to_zero(int32x2_t a) { return vceqz_s32(a); }
[[gnu::always_inline]] nce uint32x2_t greater_than_or_equal_to_zero(int32x2_t a) { return vcgez_s32(a); }
[[gnu::always_inline]] nce uint32x2_t less_than_or_equal_to_zero(int32x2_t a) { return vclez_s32(a); }
[[gnu::always_inline]] nce uint32x2_t greater_than_zero(int32x2_t a) { return vcgtz_s32(a); }
[[gnu::always_inline]] nce uint32x2_t less_than_zero(int32x2_t a) { return vcltz_s32(a); }
template <> [[gnu::always_inline]] nce float64x1_t reinterpret(int32x2_t a) { return vreinterpret_f64_s32(a); }
[[gnu::always_inline]] nce int32x2_t zip1(int32x2_t a, int32x2_t b) { return vzip1_s32(a, b); }
[[gnu::always_inline]] nce int32x2_t zip2(int32x2_t a, int32x2_t b) { return vzip2_s32(a, b); }
[[gnu::always_inline]] nce int32x2_t unzip1(int32x2_t a, int32x2_t b) { return vuzp1_s32(a, b); }
[[gnu::always_inline]] nce int32x2_t unzip2(int32x2_t a, int32x2_t b) { return vuzp2_s32(a, b); }
[[gnu::always_inline]] nce int32x2_t transpose_step_1(int32x2_t a, int32x2_t b) { return vtrn1_s32(a, b); }
[[gnu::always_inline]] nce int32x2_t transpose_step_2(int32x2_t a, int32x2_t b) { return vtrn2_s32(a, b); }
[[gnu::always_inline]] nce int32x2_t multiply_double_add_round_saturate_high(int32x2_t a, int32x2_t b, int32x2_t c) { return vqrdmlah_s32(a, b, c); }
[[gnu::always_inline]] nce int32x2_t multiply_double_subtract_round_saturate_high(int32x2_t a, int32x2_t b, int32x2_t c) { return vqrdmlsh_s32(a, b, c); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_double_add_round_saturate_high_lane(int32x2_t a, int32x2_t b, int32x2_t v) { return vqrdmlah_lane_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_double_subtract_round_saturate_high_lane(int32x2_t a, int32x2_t b, int32x2_t v) { return vqrdmlsh_lane_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_add_lane(int32x2_t a, int32x2_t b, int32x4_t v) { return vmla_laneq_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_subtract_lane(int32x2_t a, int32x2_t b, int32x4_t v) { return vmls_laneq_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_double_add_round_saturate_high_lane(int32x2_t a, int32x2_t b, int32x4_t v) { return vqrdmlah_laneq_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_double_subtract_round_saturate_high_lane(int32x2_t a, int32x2_t b, int32x4_t v) { return vqrdmlsh_laneq_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t multiply_lane(int32x2_t a, int32x4_t v) { return vmul_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_long_lane(int32x2_t a, int32x4_t v) { return vmull_laneq_s32(a, v, lane); }
template <int n>[[gnu::always_inline]] nce int32x4_t shift_right_narrow_high(int32x2_t r, int64x2_t a) { return vshrn_high_n_s64(r, a, n); }
template <int n>[[gnu::always_inline]] nce int32x4_t shift_right_saturate_narrow_high(int32x2_t r, int64x2_t a) { return vqshrn_high_n_s64(r, a, n); }
template <int n>[[gnu::always_inline]] nce int32x4_t shift_right_round_saturate_narrow_high(int32x2_t r, int64x2_t a) { return vqrshrn_high_n_s64(r, a, n); }
template <int n>[[gnu::always_inline]] nce int32x4_t shift_right_round_narrow_high(int32x2_t r, int64x2_t a) { return vrshrn_high_n_s64(r, a, n); }
[[gnu::always_inline]] nce int32x4_t move_saturate_narrow_high(int32x2_t r, int64x2_t a) { return vqmovn_high_s64(r, a); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int32x2_t copy_lane(int32x2_t a, int32x2_t b) { return vcopy_lane_s32(a, lane1, b, lane2); }
template <int lane1, int lane2>[[gnu::always_inline]] nce int32x2_t copy_lane(int32x2_t a, int32x4_t b) { return vcopy_laneq_s32(a, lane1, b, lane2); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long_lane(int32x4_t a, int16x4_t b, int16x8_t v) { return vqdmlal_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long_lane(int32x4_t a, int16x4_t b, int16x8_t v) { return vqdmlsl_laneq_s16(a, b, v, lane); }
[[gnu::always_inline]] nce int32x4_t add_high(int32x4_t a, int16x8_t b) { return vaddw_high_s16(a, b); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long_lane_high(int32x4_t a, int16x8_t b, int16x4_t v) { return vqdmlal_high_lane_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long_lane_high(int32x4_t a, int16x8_t b, int16x4_t v) { return vqdmlsl_high_lane_s16(a, b, v, lane); }
[[gnu::always_inline]] nce int32x4_t multiply_add_long_high(int32x4_t a, int16x8_t b, int16x8_t c) { return vmlal_high_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_subtract_long_high(int32x4_t a, int16x8_t b, int16x8_t c) { return vmlsl_high_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long_high(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlal_high_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long_high(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlsl_high_s16(a, b, c); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long_lane_high(int32x4_t a, int16x8_t b, int16x8_t v) { return vqdmlal_high_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long_lane_high(int32x4_t a, int16x8_t b, int16x8_t v) { return vqdmlsl_high_laneq_s16(a, b, v, lane); }
[[gnu::always_inline]] nce int32x4_t subtract_absolute_add_high(int32x4_t a, int16x8_t b, int16x8_t c) { return vabal_high_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_double_add_saturate_long_high(int32x4_t a, int16x8_t b, int16_t c) { return vqdmlal_high_n_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_saturate_long_high(int32x4_t a, int16x8_t b, int16_t c) { return vqdmlsl_high_n_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t subtract_high(int32x4_t a, int16x8_t b) { return vsubw_high_s16(a, b); }
template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long_lane_high(int32x4_t a, int32x2_t v) { return vqdmull_high_lane_s32(a, v, lane); }
[[gnu::always_inline]] nce int64x2_t add_long_high(int32x4_t a, int32x4_t b) { return vaddl_high_s32(a, b); }
[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long_high(int32x4_t a, int32x4_t b) { return vqdmull_high_s32(a, b); }
[[gnu::always_inline]] nce int64x2_t multiply_long_high(int32x4_t a, int32x4_t b) { return vmull_high_s32(a, b); }
template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long_lane_high(int32x4_t a, int32x4_t v) { return vqdmull_high_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_saturate_high_lane(int32x4_t a, int32x4_t v) { return vqdmulhq_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_round_saturate_high_lane(int32x4_t a, int32x4_t v) { return vqrdmulhq_laneq_s32(a, v, lane); }
[[gnu::always_inline]] nce int64x2_t subtract_long_high(int32x4_t a, int32x4_t b) { return vsubl_high_s32(a, b); }
[[gnu::always_inline]] nce int64x2_t subtract_absolute_long_high(int32x4_t a, int32x4_t b) { return vabdl_high_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t pairwise_add(int32x4_t a, int32x4_t b) { return vpaddq_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t pairwise_max(int32x4_t a, int32x4_t b) { return vpmaxq_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t pairwise_min(int32x4_t a, int32x4_t b) { return vpminq_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t add_saturate(int32x4_t a, uint32x4_t b) { return vuqaddq_s32(a, b); }
[[gnu::always_inline]] nce int64x2_t multiply_double_saturate_long_high(int32x4_t a, int32_t b) { return vqdmull_high_n_s32(a, b); }
[[gnu::always_inline]] nce int32_t reduce_add(int32x4_t a) { return vaddvq_s32(a); }
[[gnu::always_inline]] nce int64_t reduce_add_long(int32x4_t a) { return vaddlvq_s32(a); }
[[gnu::always_inline]] inline int32_t reduce_max(int32x4_t a) { return vmaxvq_s32(a); }
[[gnu::always_inline]] inline int32_t reduce_min(int32x4_t a) { return vminvq_s32(a); }
[[gnu::always_inline]] nce uint32x4_t equal_to_zero(int32x4_t a) { return vceqzq_s32(a); }
[[gnu::always_inline]] nce uint32x4_t greater_than_or_equal_to_zero(int32x4_t a) { return vcgezq_s32(a); }
[[gnu::always_inline]] nce uint32x4_t less_than_or_equal_to_zero(int32x4_t a) { return vclezq_s32(a); }
[[gnu::always_inline]] nce uint32x4_t greater_than_zero(int32x4_t a) { return vcgtzq_s32(a); }
[[gnu::always_inline]] nce uint32x4_t less_than_zero(int32x4_t a) { return vcltzq_s32(a); }
template <int n>[[gnu::always_inline]] nce int64x2_t shift_left_long_high(int32x4_t a) { return vshll_high_n_s32(a, n); }
template <> [[gnu::always_inline]] nce float64x2_t reinterpret(int32x4_t a) { return vreinterpretq_f64_s32(a); }
[[gnu::always_inline]] nce int64x2_t move_long_high(int32x4_t a) { return vmovl_high_s32(a); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_add_long_lane(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlal_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_subtract_long_lane(int32x4_t a, int16x4_t b, int16x8_t v) { return vmlsl_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_add_long_lane_high(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlal_high_lane_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_subtract_long_lane_high(int32x4_t a, int16x8_t b, int16x4_t v) { return vmlsl_high_lane_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_add_long_lane_high(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlal_high_laneq_s16(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_subtract_long_lane_high(int32x4_t a, int16x8_t b, int16x8_t v) { return vmlsl_high_laneq_s16(a, b, v, lane); }
[[gnu::always_inline]] nce int32x4_t multiply_add_long_high(int32x4_t a, int16x8_t b, int16_t c) { return vmlal_high_n_s16(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_subtract_long_high(int32x4_t a, int16x8_t b, int16_t c) { return vmlsl_high_n_s16(a, b, c); }
template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_long_lane_high(int32x4_t a, int32x2_t v) { return vmull_high_lane_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_add_lane(int32x4_t a, int32x4_t b, int32x4_t v) { return vmlaq_laneq_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_subtract_lane(int32x4_t a, int32x4_t b, int32x4_t v) { return vmlsq_laneq_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_lane(int32x4_t a, int32x4_t v) { return vmulq_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_long_lane_high(int32x4_t a, int32x4_t v) { return vmull_high_laneq_s32(a, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x2_t duplicate_lane(int32x4_t vec) { return vdup_laneq_s32(vec, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t duplicate_lane_quad(int32x4_t vec) { return vdupq_laneq_s32(vec, lane); }
[[gnu::always_inline]] nce int32x4_t zip1(int32x4_t a, int32x4_t b) { return vzip1q_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t zip2(int32x4_t a, int32x4_t b) { return vzip2q_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t unzip1(int32x4_t a, int32x4_t b) { return vuzp1q_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t unzip2(int32x4_t a, int32x4_t b) { return vuzp2q_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t transpose_step_1(int32x4_t a, int32x4_t b) { return vtrn1q_s32(a, b); }
[[gnu::always_inline]] nce int32x4_t transpose_step_2(int32x4_t a, int32x4_t b) { return vtrn2q_s32(a, b); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_add_round_saturate_high_lane(int32x4_t a, int32x4_t b, int32x2_t v) { return vqrdmlahq_lane_s32(a, b, v, lane); }
template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_round_saturate_high_lane(int32x4_t a, int32x4_t b, int32x2_t v) { return vqrdmlshq_lane_s32(a, b, v, lane); }
[[gnu::always_inline]] nce int32x4_t multiply_double_add_round_saturate_high(int32x4_t a, int32x4_t b, int32x4_t c) { return vqrdmlahq_s32(a, b, c); }
[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_round_saturate_high(int32x4_t a, int32x4_t b, int32x4_t c) { return vqrdmlshq_s32(a, b, c); }
561template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_add_round_saturate_high_lane(int32x4_t a, int32x4_t b, int32x4_t v) { return vqrdmlahq_laneq_s32(a, b, v, lane); }
562template <int lane>[[gnu::always_inline]] nce int32x4_t multiply_double_subtract_round_saturate_high_lane(int32x4_t a, int32x4_t b, int32x4_t v) { return vqrdmlshq_laneq_s32(a, b, v, lane); }
563
564
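A minimal usage sketch (editor's illustration, not part of the upstream header; the example_ name is hypothetical): a rounded Q31 multiply-accumulate built on the saturating rounding doubling wrapper defined above.

[[gnu::always_inline]] inline int32x4_t example_q31_mla(int32x4_t acc, int32x4_t x, int32x4_t coeff) {
  // per lane: sat(acc + ((2 * x * coeff + 2^31) >> 32)), i.e. acc += x * coeff in Q31 with rounding (vqrdmlahq_s32)
  return multiply_double_add_round_saturate_high(acc, x, coeff);
}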
565[[gnu::always_inline]] nce int64x2_t multiply_long_high(int32x4_t a, int32_t b) { return vmull_high_n_s32(a, b); }
566template <int lane1, int lane2>[[gnu::always_inline]] nce int32x4_t copy_lane(int32x4_t a, int32x2_t b) { return vcopyq_lane_s32(a, lane1, b, lane2); }
567template <int lane1, int lane2>[[gnu::always_inline]] nce int32x4_t copy_lane(int32x4_t a, int32x4_t b) { return vcopyq_laneq_s32(a, lane1, b, lane2); }
568[[gnu::always_inline]] nce uint64x1_t equal(uint64x1_t a, uint64x1_t b) { return vceq_u64(a, b); }
569[[gnu::always_inline]] nce uint64x1_t add_saturate(uint64x1_t a, int64x1_t b) { return vsqadd_u64(a, b); }
570[[gnu::always_inline]] nce uint64x1_t equal_to_zero(uint64x1_t a) { return vceqz_u64(a); }
571[[gnu::always_inline]] nce uint64x1_t greater_than_or_equal(uint64x1_t a, uint64x1_t b) { return vcge_u64(a, b); }
572[[gnu::always_inline]] nce uint64x1_t less_than_or_equal(uint64x1_t a, uint64x1_t b) { return vcle_u64(a, b); }
573[[gnu::always_inline]] nce uint64x1_t greater_than(uint64x1_t a, uint64x1_t b) { return vcgt_u64(a, b); }
574[[gnu::always_inline]] nce uint64x1_t less_than(uint64x1_t a, uint64x1_t b) { return vclt_u64(a, b); }
575[[gnu::always_inline]] nce uint64x1_t compare_test_nonzero(uint64x1_t a, uint64x1_t b) { return vtst_u64(a, b); }
576template <> [[gnu::always_inline]] nce float64x1_t convert(uint64x1_t a) { return vcvt_f64_u64(a); }
577template <int n>[[gnu::always_inline]] nce float64x1_t convert(uint64x1_t a) { return vcvt_n_f64_u64(a, n); }
578template <> [[gnu::always_inline]] nce float64x1_t reinterpret(uint64x1_t a) { return vreinterpret_f64_u64(a); }
579[[gnu::always_inline]] nce float64x1_t bitwise_select(uint64x1_t a, float64x1_t b, float64x1_t c) { return vbsl_f64(a, b, c); }
580template <int lane1, int lane2>[[gnu::always_inline]] nce uint64x1_t copy_lane(uint64x1_t a, uint64x1_t b) { return vcopy_lane_u64(a, lane1, b, lane2); }
581template <int lane1, int lane2>[[gnu::always_inline]] nce uint64x1_t copy_lane(uint64x1_t a, uint64x2_t b) { return vcopy_laneq_u64(a, lane1, b, lane2); }
582template <int lane>[[gnu::always_inline]] nce uint64_t duplicate_lane(uint64x1_t vec) { return vdupd_lane_u64(vec, lane); }
583[[gnu::always_inline]] nce uint64x2_t pairwise_add(uint64x2_t a, uint64x2_t b) { return vpaddq_u64(a, b); }
584[[gnu::always_inline]] nce uint64x2_t subtract_absolute_add_high(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vabal_high_u32(a, b, c); }
585[[gnu::always_inline]] nce uint64_t pairwise_add(uint64x2_t a) { return vpaddd_u64(a); }
586[[gnu::always_inline]] nce uint64_t reduce_add(uint64x2_t a) { return vaddvq_u64(a); }
587[[gnu::always_inline]] nce uint64x2_t equal(uint64x2_t a, uint64x2_t b) { return vceqq_u64(a, b); }
588[[gnu::always_inline]] nce uint64x2_t add_high(uint64x2_t a, uint32x4_t b) { return vaddw_high_u32(a, b); }
589[[gnu::always_inline]] nce uint64x2_t multiply_add_long_high(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vmlal_high_u32(a, b, c); }
590[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long_high(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vmlsl_high_u32(a, b, c); }
591[[gnu::always_inline]] nce uint64x2_t subtract_high(uint64x2_t a, uint32x4_t b) { return vsubw_high_u32(a, b); }
592[[gnu::always_inline]] nce uint64x2_t add_saturate(uint64x2_t a, int64x2_t b) { return vsqaddq_u64(a, b); }
593[[gnu::always_inline]] nce uint64x2_t equal_to_zero(uint64x2_t a) { return vceqzq_u64(a); }
594[[gnu::always_inline]] nce uint64x2_t greater_than_or_equal(uint64x2_t a, uint64x2_t b) { return vcgeq_u64(a, b); }
595[[gnu::always_inline]] nce uint64x2_t less_than_or_equal(uint64x2_t a, uint64x2_t b) { return vcleq_u64(a, b); }
596[[gnu::always_inline]] nce uint64x2_t greater_than(uint64x2_t a, uint64x2_t b) { return vcgtq_u64(a, b); }
597[[gnu::always_inline]] nce uint64x2_t less_than(uint64x2_t a, uint64x2_t b) { return vcltq_u64(a, b); }
598[[gnu::always_inline]] nce uint64x2_t compare_test_nonzero(uint64x2_t a, uint64x2_t b) { return vtstq_u64(a, b); }
599template <> [[gnu::always_inline]] nce float64x2_t convert(uint64x2_t a) { return vcvtq_f64_u64(a); }
600template <int n>[[gnu::always_inline]] nce float64x2_t convert(uint64x2_t a) { return vcvtq_n_f64_u64(a, n); }
601template <int lane>[[gnu::always_inline]] nce uint64x1_t duplicate_lane(uint64x2_t vec) { return vdup_laneq_u64(vec, lane); }
602template <int lane>[[gnu::always_inline]] nce uint64x2_t duplicate_lane_quad(uint64x2_t vec) { return vdupq_laneq_u64(vec, lane); }
603template <int lane>[[gnu::always_inline]] nce uint64_t duplicate_lane(uint64x2_t vec) { return vdupd_laneq_u64(vec, lane); }
604[[gnu::always_inline]] nce uint64x2_t zip1(uint64x2_t a, uint64x2_t b) { return vzip1q_u64(a, b); }
605[[gnu::always_inline]] nce uint64x2_t zip2(uint64x2_t a, uint64x2_t b) { return vzip2q_u64(a, b); }
606[[gnu::always_inline]] nce uint64x2_t unzip1(uint64x2_t a, uint64x2_t b) { return vuzp1q_u64(a, b); }
607[[gnu::always_inline]] nce uint64x2_t unzip2(uint64x2_t a, uint64x2_t b) { return vuzp2q_u64(a, b); }
608[[gnu::always_inline]] nce uint64x2_t transpose_step_1(uint64x2_t a, uint64x2_t b) { return vtrn1q_u64(a, b); }
609[[gnu::always_inline]] nce uint64x2_t transpose_step_2(uint64x2_t a, uint64x2_t b) { return vtrn2q_u64(a, b); }
610
611
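Illustrative sketch (editor's addition, not in the source; the example_ name is hypothetical): transposing a 2x2 block of 64-bit rows in place with the permute wrappers defined above.

[[gnu::always_inline]] inline void example_transpose_2x2_u64(uint64x2_t &row0, uint64x2_t &row1) {
  uint64x2_t t0 = transpose_step_1(row0, row1);  // vtrn1q_u64: {row0[0], row1[0]}
  uint64x2_t t1 = transpose_step_2(row0, row1);  // vtrn2q_u64: {row0[1], row1[1]}
  row0 = t0;
  row1 = t1;
}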
612template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_add_long_lane(uint64x2_t a, uint32x2_t b, uint32x4_t v) { return vmlal_laneq_u32(a, b, v, lane); }
613template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long_lane(uint64x2_t a, uint32x2_t b, uint32x4_t v) { return vmlsl_laneq_u32(a, b, v, lane); }
614template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_add_long_lane_high(uint64x2_t a, uint32x4_t b, uint32x2_t v) { return vmlal_high_lane_u32(a, b, v, lane); }
615template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long_lane_high(uint64x2_t a, uint32x4_t b, uint32x2_t v) { return vmlsl_high_lane_u32(a, b, v, lane); }
616template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_add_long_lane_high(uint64x2_t a, uint32x4_t b, uint32x4_t v) { return vmlal_high_laneq_u32(a, b, v, lane); }
617template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long_lane_high(uint64x2_t a, uint32x4_t b, uint32x4_t v) { return vmlsl_high_laneq_u32(a, b, v, lane); }
618[[gnu::always_inline]] nce uint64x2_t multiply_add_long_high(uint64x2_t a, uint32x4_t b, uint32_t c) { return vmlal_high_n_u32(a, b, c); }
619[[gnu::always_inline]] nce uint64x2_t multiply_subtract_long_high(uint64x2_t a, uint32x4_t b, uint32_t c) { return vmlsl_high_n_u32(a, b, c); }
620[[gnu::always_inline]] nce float64x2_t bitwise_select(uint64x2_t a, float64x2_t b, float64x2_t c) { return vbslq_f64(a, b, c); }
621template <int lane1, int lane2>[[gnu::always_inline]] nce uint64x2_t copy_lane(uint64x2_t a, uint64x1_t b) { return vcopyq_lane_u64(a, lane1, b, lane2); }
622template <int lane1, int lane2>[[gnu::always_inline]] nce uint64x2_t copy_lane(uint64x2_t a, uint64x2_t b) { return vcopyq_laneq_u64(a, lane1, b, lane2); }
623[[gnu::always_inline]] nce uint32x2_t add_saturate(uint32x2_t a, int32x2_t b) { return vsqadd_u32(a, b); }
624[[gnu::always_inline]] nce uint32x4_t add_narrow_high_high(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vaddhn_high_u64(r, a, b); }
625[[gnu::always_inline]] nce uint32x4_t add_round_narrow_high_high(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vraddhn_high_u64(r, a, b); }
626[[gnu::always_inline]] nce uint32x4_t subtract_narrow_high_high(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vsubhn_high_u64(r, a, b); }
627[[gnu::always_inline]] nce uint32x4_t subtract_round_narrow_high_high(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vrsubhn_high_u64(r, a, b); }
628[[gnu::always_inline]] nce uint32_t reduce_add(uint32x2_t a) { return vaddv_u32(a); }
629[[gnu::always_inline]] nce uint64_t reduce_add_long(uint32x2_t a) { return vaddlv_u32(a); }
630[[gnu::always_inline]] inline uint32_t reduce_max(uint32x2_t a) { return vmaxv_u32(a); }
631[[gnu::always_inline]] inline uint32_t reduce_min(uint32x2_t a) { return vminv_u32(a); }
632[[gnu::always_inline]] nce uint32x2_t equal_to_zero(uint32x2_t a) { return vceqz_u32(a); }
633template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_narrow_high(uint32x2_t r, uint64x2_t a) { return vshrn_high_n_u64(r, a, n); }
634template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_saturate_narrow_high(uint32x2_t r, uint64x2_t a) { return vqshrn_high_n_u64(r, a, n); }
635template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_round_saturate_narrow_high(uint32x2_t r, uint64x2_t a) { return vqrshrn_high_n_u64(r, a, n); }
636template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_round_narrow_high(uint32x2_t r, uint64x2_t a) { return vrshrn_high_n_u64(r, a, n); }
637template <> [[gnu::always_inline]] nce float64x1_t reinterpret(uint32x2_t a) { return vreinterpret_f64_u32(a); }
638[[gnu::always_inline]] nce uint32x4_t move_saturate_narrow_high(uint32x2_t r, uint64x2_t a) { return vqmovn_high_u64(r, a); }
639template <int lane>[[gnu::always_inline]] nce uint32x2_t multiply_add_lane(uint32x2_t a, uint32x2_t b, uint32x4_t v) { return vmla_laneq_u32(a, b, v, lane); }
640template <int lane>[[gnu::always_inline]] nce uint32x2_t multiply_subtract_lane(uint32x2_t a, uint32x2_t b, uint32x4_t v) { return vmls_laneq_u32(a, b, v, lane); }
641[[gnu::always_inline]] nce uint32x2_t zip1(uint32x2_t a, uint32x2_t b) { return vzip1_u32(a, b); }
642[[gnu::always_inline]] nce uint32x2_t zip2(uint32x2_t a, uint32x2_t b) { return vzip2_u32(a, b); }
643[[gnu::always_inline]] nce uint32x2_t unzip1(uint32x2_t a, uint32x2_t b) { return vuzp1_u32(a, b); }
644[[gnu::always_inline]] nce uint32x2_t unzip2(uint32x2_t a, uint32x2_t b) { return vuzp2_u32(a, b); }
645[[gnu::always_inline]] nce uint32x2_t transpose_step_1(uint32x2_t a, uint32x2_t b) { return vtrn1_u32(a, b); }
646[[gnu::always_inline]] nce uint32x2_t transpose_step_2(uint32x2_t a, uint32x2_t b) { return vtrn2_u32(a, b); }
647template <int lane>[[gnu::always_inline]] nce uint32x2_t multiply_lane(uint32x2_t a, uint32x4_t v) { return vmul_laneq_u32(a, v, lane); }
648template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_long_lane(uint32x2_t a, uint32x4_t v) { return vmull_laneq_u32(a, v, lane); }
649template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_saturate_narrow_unsigned_high(uint32x2_t r, int64x2_t a) { return vqshrun_high_n_s64(r, a, n); }
650template <int n>[[gnu::always_inline]] nce uint32x4_t shift_right_unsigned_saturate_narrow_high(uint32x2_t r, int64x2_t a) { return vqrshrun_high_n_s64(r, a, n); }
651[[gnu::always_inline]] nce uint32x4_t move_unsigned_saturate_narrow_high(uint32x2_t r, int64x2_t a) { return vqmovun_high_s64(r, a); }
652template <int lane1, int lane2>[[gnu::always_inline]] nce uint32x2_t copy_lane(uint32x2_t a, uint32x2_t b) { return vcopy_lane_u32(a, lane1, b, lane2); }
653template <int lane1, int lane2>[[gnu::always_inline]] nce uint32x2_t copy_lane(uint32x2_t a, uint32x4_t b) { return vcopy_laneq_u32(a, lane1, b, lane2); }
654[[gnu::always_inline]] nce uint32x4_t add_high(uint32x4_t a, uint16x8_t b) { return vaddw_high_u16(a, b); }
655[[gnu::always_inline]] nce uint32x4_t multiply_add_long_high(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vmlal_high_u16(a, b, c); }
656[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long_high(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vmlsl_high_u16(a, b, c); }
657[[gnu::always_inline]] nce uint32x4_t subtract_high(uint32x4_t a, uint16x8_t b) { return vsubw_high_u16(a, b); }
658[[gnu::always_inline]] nce uint32x4_t subtract_absolute_add_high(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vabal_high_u16(a, b, c); }
659[[gnu::always_inline]] nce uint32x4_t add_saturate(uint32x4_t a, int32x4_t b) { return vsqaddq_u32(a, b); }
660[[gnu::always_inline]] nce uint64x2_t add_long_high(uint32x4_t a, uint32x4_t b) { return vaddl_high_u32(a, b); }
661[[gnu::always_inline]] nce uint64x2_t multiply_long_high(uint32x4_t a, uint32x4_t b) { return vmull_high_u32(a, b); }
662[[gnu::always_inline]] nce uint64x2_t subtract_long_high(uint32x4_t a, uint32x4_t b) { return vsubl_high_u32(a, b); }
663[[gnu::always_inline]] nce uint64x2_t subtract_absolute_long_high(uint32x4_t a, uint32x4_t b) { return vabdl_high_u32(a, b); }
664[[gnu::always_inline]] nce uint32x4_t pairwise_add(uint32x4_t a, uint32x4_t b) { return vpaddq_u32(a, b); }
665[[gnu::always_inline]] nce uint32x4_t pairwise_max(uint32x4_t a, uint32x4_t b) { return vpmaxq_u32(a, b); }
666[[gnu::always_inline]] nce uint32x4_t pairwise_min(uint32x4_t a, uint32x4_t b) { return vpminq_u32(a, b); }
667[[gnu::always_inline]] nce uint32_t reduce_add(uint32x4_t a) { return vaddvq_u32(a); }
668[[gnu::always_inline]] nce uint64_t reduce_add_long(uint32x4_t a) { return vaddlvq_u32(a); }
669[[gnu::always_inline]] inline uint32_t reduce_max(uint32x4_t a) { return vmaxvq_u32(a); }
670[[gnu::always_inline]] inline uint32_t reduce_min(uint32x4_t a) { return vminvq_u32(a); }
671[[gnu::always_inline]] nce uint32x4_t equal_to_zero(uint32x4_t a) { return vceqzq_u32(a); }
672template <int n>[[gnu::always_inline]] nce uint64x2_t shift_left_long_high(uint32x4_t a) { return vshll_high_n_u32(a, n); }
673template <> [[gnu::always_inline]] nce float64x2_t reinterpret(uint32x4_t a) { return vreinterpretq_f64_u32(a); }
674[[gnu::always_inline]] nce uint64x2_t move_long_high(uint32x4_t a) { return vmovl_high_u32(a); }
675template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_add_long_lane(uint32x4_t a, uint16x4_t b, uint16x8_t v) { return vmlal_laneq_u16(a, b, v, lane); }
676template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long_lane(uint32x4_t a, uint16x4_t b, uint16x8_t v) { return vmlsl_laneq_u16(a, b, v, lane); }
677template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_add_long_lane_high(uint32x4_t a, uint16x8_t b, uint16x4_t v) { return vmlal_high_lane_u16(a, b, v, lane); }
678template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long_lane_high(uint32x4_t a, uint16x8_t b, uint16x4_t v) { return vmlsl_high_lane_u16(a, b, v, lane); }
679template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_add_long_lane_high(uint32x4_t a, uint16x8_t b, uint16x8_t v) { return vmlal_high_laneq_u16(a, b, v, lane); }
680template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long_lane_high(uint32x4_t a, uint16x8_t b, uint16x8_t v) { return vmlsl_high_laneq_u16(a, b, v, lane); }
681[[gnu::always_inline]] nce uint32x4_t multiply_add_long_high(uint32x4_t a, uint16x8_t b, uint16_t c) { return vmlal_high_n_u16(a, b, c); }
682[[gnu::always_inline]] nce uint32x4_t multiply_subtract_long_high(uint32x4_t a, uint16x8_t b, uint16_t c) { return vmlsl_high_n_u16(a, b, c); }
683template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_long_lane_high(uint32x4_t a, uint32x2_t v) { return vmull_high_lane_u32(a, v, lane); }
684template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_add_lane(uint32x4_t a, uint32x4_t b, uint32x4_t v) { return vmlaq_laneq_u32(a, b, v, lane); }
685template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_subtract_lane(uint32x4_t a, uint32x4_t b, uint32x4_t v) { return vmlsq_laneq_u32(a, b, v, lane); }
686template <int lane>[[gnu::always_inline]] nce uint32x4_t multiply_lane(uint32x4_t a, uint32x4_t v) { return vmulq_laneq_u32(a, v, lane); }
687template <int lane>[[gnu::always_inline]] nce uint64x2_t multiply_long_lane_high(uint32x4_t a, uint32x4_t v) { return vmull_high_laneq_u32(a, v, lane); }
688template <int lane>[[gnu::always_inline]] nce uint32x2_t duplicate_lane(uint32x4_t vec) { return vdup_laneq_u32(vec, lane); }
689template <int lane>[[gnu::always_inline]] nce uint32x4_t duplicate_lane_quad(uint32x4_t vec) { return vdupq_laneq_u32(vec, lane); }
690[[gnu::always_inline]] nce uint32x4_t zip1(uint32x4_t a, uint32x4_t b) { return vzip1q_u32(a, b); }
691[[gnu::always_inline]] nce uint32x4_t zip2(uint32x4_t a, uint32x4_t b) { return vzip2q_u32(a, b); }
692[[gnu::always_inline]] nce uint32x4_t unzip1(uint32x4_t a, uint32x4_t b) { return vuzp1q_u32(a, b); }
693[[gnu::always_inline]] nce uint32x4_t unzip2(uint32x4_t a, uint32x4_t b) { return vuzp2q_u32(a, b); }
694[[gnu::always_inline]] nce uint32x4_t transpose_step_1(uint32x4_t a, uint32x4_t b) { return vtrn1q_u32(a, b); }
695[[gnu::always_inline]] nce uint32x4_t transpose_step_2(uint32x4_t a, uint32x4_t b) { return vtrn2q_u32(a, b); }
696
697
698
699
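Usage sketch (editor's addition, not part of the header; the example_ name is hypothetical): a widening multiply-accumulate over the high halves of two uint32x4_t vectors, folded to a scalar with the uint64x2_t reduction defined above.

[[gnu::always_inline]] inline uint64_t example_mac_high_u32(uint64x2_t acc, uint32x4_t x, uint32x4_t y) {
  acc = multiply_add_long_high(acc, x, y);  // vmlal_high_u32: acc += widen(x[2..3]) * widen(y[2..3])
  return reduce_add(acc);                   // vaddvq_u64
}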
700[[gnu::always_inline]] nce uint64x2_t multiply_long_high(uint32x4_t a, uint32_t b) { return vmull_high_n_u32(a, b); }
701template <int lane1, int lane2>[[gnu::always_inline]] nce uint32x4_t copy_lane(uint32x4_t a, uint32x2_t b) { return vcopyq_lane_u32(a, lane1, b, lane2); }
702template <int lane1, int lane2>[[gnu::always_inline]] nce uint32x4_t copy_lane(uint32x4_t a, uint32x4_t b) { return vcopyq_laneq_u32(a, lane1, b, lane2); }
703[[gnu::always_inline]] nce float16x8_t convert_high(float16x4_t r, float32x4_t a) { return vcvt_high_f16_f32(r, a); }
704template <> [[gnu::always_inline]] nce float64x1_t reinterpret(float16x4_t a) { return vreinterpret_f64_f16(a); }
705[[gnu::always_inline]] nce float16x4_t round_using_current_mode(float16x4_t a) { return vrndi_f16(a); }
706[[gnu::always_inline]] nce float16x4_t square_root(float16x4_t a) { return vsqrt_f16(a); }
707[[gnu::always_inline]] nce float16x4_t divide(float16x4_t a, float16x4_t b) { return vdiv_f16(a, b); }
708[[gnu::always_inline]] inline float16_t reduce_max(float16x4_t a) { return vmaxv_f16(a); }
709[[gnu::always_inline]] inline float16_t reduce_max_strict(float16x4_t a) { return vmaxnmv_f16(a); }
710[[gnu::always_inline]] inline float16_t reduce_min(float16x4_t a) { return vminv_f16(a); }
711[[gnu::always_inline]] inline float16_t reduce_min_strict(float16x4_t a) { return vminnmv_f16(a); }
712[[gnu::always_inline]] nce float16x4_t multiply_extended(float16x4_t a, float16x4_t b) { return vmulx_f16(a, b); }
713template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_extended_lane(float16x4_t a, float16x4_t v) { return vmulx_lane_f16(a, v, lane); }
714template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_add_fused_lane(float16x4_t a, float16x4_t b, float16x4_t v) { return vfma_lane_f16(a, b, v, lane); }
715template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_subtract_fused_lane(float16x4_t a, float16x4_t b, float16x4_t v) { return vfms_lane_f16(a, b, v, lane); }
716template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_add_fused_lane(float16x4_t a, float16x4_t b, float16x8_t v) { return vfma_laneq_f16(a, b, v, lane); }
717template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_subtract_fused_lane(float16x4_t a, float16x4_t b, float16x8_t v) { return vfms_laneq_f16(a, b, v, lane); }
718[[gnu::always_inline]] nce float16x4_t multiply_add_fused(float16x4_t a, float16x4_t b, float16_t n) { return vfma_n_f16(a, b, n); }
719[[gnu::always_inline]] nce float16x4_t multiply_subtract_fused(float16x4_t a, float16x4_t b, float16_t n) { return vfms_n_f16(a, b, n); }
720[[gnu::always_inline]] nce float16x4_t pairwise_max_strict(float16x4_t a, float16x4_t b) { return vpmaxnm_f16(a, b); }
721[[gnu::always_inline]] nce float16x4_t pairwise_min_strict(float16x4_t a, float16x4_t b) { return vpminnm_f16(a, b); }
722[[gnu::always_inline]] nce float16x4_t zip1(float16x4_t a, float16x4_t b) { return vzip1_f16(a, b); }
723[[gnu::always_inline]] nce float16x4_t zip2(float16x4_t a, float16x4_t b) { return vzip2_f16(a, b); }
724[[gnu::always_inline]] nce float16x4_t unzip1(float16x4_t a, float16x4_t b) { return vuzp1_f16(a, b); }
725[[gnu::always_inline]] nce float16x4_t unzip2(float16x4_t a, float16x4_t b) { return vuzp2_f16(a, b); }
726[[gnu::always_inline]] nce float16x4_t transpose_step_1(float16x4_t a, float16x4_t b) { return vtrn1_f16(a, b); }
727[[gnu::always_inline]] nce float16x4_t transpose_step_2(float16x4_t a, float16x4_t b) { return vtrn2_f16(a, b); }
728template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_lane(float16x4_t a, float16x8_t v) { return vmul_laneq_f16(a, v, lane); }
729template <int lane>[[gnu::always_inline]] nce float16x4_t multiply_extended_lane(float16x4_t a, float16x8_t v) { return vmulx_laneq_f16(a, v, lane); }
730[[gnu::always_inline]] nce float16x4_t multiply_extended(float16x4_t a, float16_t n) { return vmulx_n_f16(a, n); }
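Usage sketch (editor's addition, not part of the header; the example_ name is hypothetical, and like the surrounding wrappers it assumes ARMv8.2-A half-precision arithmetic): a fused scale-and-accumulate by a scalar followed by an IEEE maxNum reduction.

[[gnu::always_inline]] inline float16_t example_axpy_max_f16(float16x4_t acc, float16x4_t x, float16_t a) {
  acc = multiply_add_fused(acc, x, a);  // vfma_n_f16: acc += x * a
  return reduce_max_strict(acc);        // vmaxnmv_f16: maxNum across the four lanes
}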
731[[gnu::always_inline]] nce float32x4_t convert_high(float16x8_t a) { return vcvt_high_f32_f16(a); }
732template <> [[gnu::always_inline]] nce float64x2_t reinterpret(float16x8_t a) { return vreinterpretq_f64_f16(a); }
733[[gnu::always_inline]] nce float16x8_t round_using_current_mode(float16x8_t a) { return vrndiq_f16(a); }
734[[gnu::always_inline]] nce float16x8_t square_root(float16x8_t a) { return vsqrtq_f16(a); }
735[[gnu::always_inline]] nce float16x8_t divide(float16x8_t a, float16x8_t b) { return vdivq_f16(a, b); }
736[[gnu::always_inline]] inline float16_t reduce_max(float16x8_t a) { return vmaxvq_f16(a); }
737[[gnu::always_inline]] inline float16_t reduce_max_strict(float16x8_t a) { return vmaxnmvq_f16(a); }
738[[gnu::always_inline]] inline float16_t reduce_min(float16x8_t a) { return vminvq_f16(a); }
739[[gnu::always_inline]] inline float16_t reduce_min_strict(float16x8_t a) { return vminnmvq_f16(a); }
740template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_extended_lane(float16x8_t a, float16x4_t v) { return vmulxq_lane_f16(a, v, lane); }
741template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_lane(float16x8_t a, float16x8_t v) { return vmulq_laneq_f16(a, v, lane); }
742[[gnu::always_inline]] nce float16x8_t multiply_extended(float16x8_t a, float16x8_t b) { return vmulxq_f16(a, b); }
743template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_extended_lane(float16x8_t a, float16x8_t v) { return vmulxq_laneq_f16(a, v, lane); }
744template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_add_fused_lane(float16x8_t a, float16x8_t b, float16x4_t v) { return vfmaq_lane_f16(a, b, v, lane); }
745template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_subtract_fused_lane(float16x8_t a, float16x8_t b, float16x4_t v) { return vfmsq_lane_f16(a, b, v, lane); }
746template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_add_fused_lane(float16x8_t a, float16x8_t b, float16x8_t v) { return vfmaq_laneq_f16(a, b, v, lane); }
747template <int lane>[[gnu::always_inline]] nce float16x8_t multiply_subtract_fused_lane(float16x8_t a, float16x8_t b, float16x8_t v) { return vfmsq_laneq_f16(a, b, v, lane); }
748[[gnu::always_inline]] nce float16x8_t multiply_add_fused(float16x8_t a, float16x8_t b, float16_t n) { return vfmaq_n_f16(a, b, n); }
749[[gnu::always_inline]] nce float16x8_t multiply_subtract_fused(float16x8_t a, float16x8_t b, float16_t n) { return vfmsq_n_f16(a, b, n); }
750[[gnu::always_inline]] nce float16x8_t pairwise_add(float16x8_t a, float16x8_t b) { return vpaddq_f16(a, b); }
751[[gnu::always_inline]] nce float16x8_t pairwise_max(float16x8_t a, float16x8_t b) { return vpmaxq_f16(a, b); }
752[[gnu::always_inline]] nce float16x8_t pairwise_max_strict(float16x8_t a, float16x8_t b) { return vpmaxnmq_f16(a, b); }
753[[gnu::always_inline]] nce float16x8_t pairwise_min(float16x8_t a, float16x8_t b) { return vpminq_f16(a, b); }
754[[gnu::always_inline]] nce float16x8_t pairwise_min_strict(float16x8_t a, float16x8_t b) { return vpminnmq_f16(a, b); }
755[[gnu::always_inline]] nce float16x8_t zip1(float16x8_t a, float16x8_t b) { return vzip1q_f16(a, b); }
756[[gnu::always_inline]] nce float16x8_t zip2(float16x8_t a, float16x8_t b) { return vzip2q_f16(a, b); }
757[[gnu::always_inline]] nce float16x8_t unzip1(float16x8_t a, float16x8_t b) { return vuzp1q_f16(a, b); }
758[[gnu::always_inline]] nce float16x8_t unzip2(float16x8_t a, float16x8_t b) { return vuzp2q_f16(a, b); }
759[[gnu::always_inline]] nce float16x8_t transpose_step_1(float16x8_t a, float16x8_t b) { return vtrn1q_f16(a, b); }
760[[gnu::always_inline]] nce float16x8_t transpose_step_2(float16x8_t a, float16x8_t b) { return vtrn2q_f16(a, b); }
761[[gnu::always_inline]] nce float16x8_t multiply_extended(float16x8_t a, float16_t n) { return vmulxq_n_f16(a, n); }
762template <int lane>[[gnu::always_inline]] nce float16x4_t duplicate_lane(float16x8_t vec) { return vdup_laneq_f16(vec, lane); }
763template <int lane>[[gnu::always_inline]] nce float16x8_t duplicate_lane_quad(float16x8_t vec) { return vdupq_laneq_f16(vec, lane); }
764[[gnu::always_inline]] nce float32x2_t multiply_extended(float32x2_t a, float32x2_t b) { return vmulx_f32(a, b); }
765template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_extended_lane(float32x2_t a, float32x2_t v) { return vmulx_lane_f32(a, v, lane); }
766template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_fused_lane(float32x2_t a, float32x2_t b, float32x2_t v) { return vfma_lane_f32(a, b, v, lane); }
767template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_fused_lane(float32x2_t a, float32x2_t b, float32x2_t v) { return vfms_lane_f32(a, b, v, lane); }
768template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_fused_lane(float32x2_t a, float32x2_t b, float32x4_t v) { return vfma_laneq_f32(a, b, v, lane); }
769template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_fused_lane(float32x2_t a, float32x2_t b, float32x4_t v) { return vfms_laneq_f32(a, b, v, lane); }
770[[gnu::always_inline]] nce float32x2_t divide(float32x2_t a, float32x2_t b) { return vdiv_f32(a, b); }
771[[gnu::always_inline]] nce float32x2_t square_root(float32x2_t a) { return vsqrt_f32(a); }
772[[gnu::always_inline]] nce float32_t pairwise_add(float32x2_t a) { return vpadds_f32(a); }
773[[gnu::always_inline]] nce float32_t pairwise_max(float32x2_t a) { return vpmaxs_f32(a); }
774[[gnu::always_inline]] nce float32_t pairwise_max_strict(float32x2_t a) { return vpmaxnms_f32(a); }
775[[gnu::always_inline]] nce float32_t pairwise_min(float32x2_t a) { return vpmins_f32(a); }
776[[gnu::always_inline]] nce float32x2_t pairwise_max_strict(float32x2_t a, float32x2_t b) { return vpmaxnm_f32(a, b); }
777[[gnu::always_inline]] nce float32x2_t pairwise_min_strict(float32x2_t a, float32x2_t b) { return vpminnm_f32(a, b); }
778template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_extended_lane(float32x2_t a, float32x4_t v) { return vmulx_laneq_f32(a, v, lane); }
779[[gnu::always_inline]] nce float32_t pairwise_min_strict(float32x2_t a) { return vpminnms_f32(a); }
780[[gnu::always_inline]] nce float32_t reduce_add(float32x2_t a) { return vaddv_f32(a); }
781[[gnu::always_inline]] inline float32_t reduce_max(float32x2_t a) { return vmaxv_f32(a); }
782[[gnu::always_inline]] inline float32_t reduce_min(float32x2_t a) { return vminv_f32(a); }
783[[gnu::always_inline]] inline float32_t reduce_max_strict(float32x2_t a) { return vmaxnmv_f32(a); }
784[[gnu::always_inline]] inline float32_t reduce_min_strict(float32x2_t a) { return vminnmv_f32(a); }
785[[gnu::always_inline]] nce uint32x2_t equal_to_zero(float32x2_t a) { return vceqz_f32(a); }
786[[gnu::always_inline]] nce uint32x2_t greater_than_or_equal_to_zero(float32x2_t a) { return vcgez_f32(a); }
787[[gnu::always_inline]] nce uint32x2_t less_than_or_equal_to_zero(float32x2_t a) { return vclez_f32(a); }
788[[gnu::always_inline]] nce uint32x2_t greater_than_zero(float32x2_t a) { return vcgtz_f32(a); }
789[[gnu::always_inline]] nce uint32x2_t less_than_zero(float32x2_t a) { return vcltz_f32(a); }
790[[gnu::always_inline]] nce float32x4_t convert_high(float32x2_t r, float64x2_t a) { return vcvt_high_f32_f64(r, a); }
791[[gnu::always_inline]] nce float64x2_t convert(float32x2_t a) { return vcvt_f64_f32(a); }
792[[gnu::always_inline]] nce float32x4_t convert_round_to_odd_high(float32x2_t r, float64x2_t a) { return vcvtx_high_f32_f64(r, a); }
793template <> [[gnu::always_inline]] nce float64x1_t reinterpret(float32x2_t a) { return vreinterpret_f64_f32(a); }
794template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_add_lane(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, lane); }
795template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_subtract_lane(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, lane); }
796[[gnu::always_inline]] nce float32x2_t round_to_32bit_integer(float32x2_t a) { return vrnd32z_f32(a); }
797[[gnu::always_inline]] nce float32x2_t round_to_64bit_integer(float32x2_t a) { return vrnd64z_f32(a); }
798[[gnu::always_inline]] nce float32x2_t round_to_32bit_integer_using_current_mode(float32x2_t a) { return vrnd32x_f32(a); }
799[[gnu::always_inline]] nce float32x2_t round_to_64bit_integer_using_current_mode(float32x2_t a) { return vrnd64x_f32(a); }
800[[gnu::always_inline]] nce float32x2_t zip1(float32x2_t a, float32x2_t b) { return vzip1_f32(a, b); }
801[[gnu::always_inline]] nce float32x2_t zip2(float32x2_t a, float32x2_t b) { return vzip2_f32(a, b); }
802[[gnu::always_inline]] nce float32x2_t unzip1(float32x2_t a, float32x2_t b) { return vuzp1_f32(a, b); }
803[[gnu::always_inline]] nce float32x2_t unzip2(float32x2_t a, float32x2_t b) { return vuzp2_f32(a, b); }
804[[gnu::always_inline]] nce float32x2_t transpose_step_1(float32x2_t a, float32x2_t b) { return vtrn1_f32(a, b); }
805[[gnu::always_inline]] nce float32x2_t transpose_step_2(float32x2_t a, float32x2_t b) { return vtrn2_f32(a, b); }
806template <int lane>[[gnu::always_inline]] nce float32x2_t multiply_lane(float32x2_t a, float32x4_t v) { return vmul_laneq_f32(a, v, lane); }
807template <int lane1, int lane2>[[gnu::always_inline]] nce float32x2_t copy_lane(float32x2_t a, float32x2_t b) { return vcopy_lane_f32(a, lane1, b, lane2); }
808template <int lane1, int lane2>[[gnu::always_inline]] nce float32x2_t copy_lane(float32x2_t a, float32x4_t b) { return vcopy_laneq_f32(a, lane1, b, lane2); }
809template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_extended_lane(float32x4_t a, float32x2_t v) { return vmulxq_lane_f32(a, v, lane); }
810[[gnu::always_inline]] nce float32x4_t multiply_extended(float32x4_t a, float32x4_t b) { return vmulxq_f32(a, b); }
811template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_extended_lane(float32x4_t a, float32x4_t v) { return vmulxq_laneq_f32(a, v, lane); }
812template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_fused_lane(float32x4_t a, float32x4_t b, float32x2_t v) { return vfmaq_lane_f32(a, b, v, lane); }
813template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_fused_lane(float32x4_t a, float32x4_t b, float32x2_t v) { return vfmsq_lane_f32(a, b, v, lane); }
814template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_fused_lane(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmaq_laneq_f32(a, b, v, lane); }
815template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_fused_lane(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmsq_laneq_f32(a, b, v, lane); }
816[[gnu::always_inline]] nce float32x4_t divide(float32x4_t a, float32x4_t b) { return vdivq_f32(a, b); }
817[[gnu::always_inline]] nce float32x4_t square_root(float32x4_t a) { return vsqrtq_f32(a); }
818[[gnu::always_inline]] nce float32x4_t pairwise_add(float32x4_t a, float32x4_t b) { return vpaddq_f32(a, b); }
819[[gnu::always_inline]] nce float32x4_t pairwise_max(float32x4_t a, float32x4_t b) { return vpmaxq_f32(a, b); }
820[[gnu::always_inline]] nce float32x4_t pairwise_min(float32x4_t a, float32x4_t b) { return vpminq_f32(a, b); }
821[[gnu::always_inline]] nce float32x4_t pairwise_max_strict(float32x4_t a, float32x4_t b) { return vpmaxnmq_f32(a, b); }
822[[gnu::always_inline]] nce float32x4_t pairwise_min_strict(float32x4_t a, float32x4_t b) { return vpminnmq_f32(a, b); }
823[[gnu::always_inline]] nce float32_t reduce_add(float32x4_t a) { return vaddvq_f32(a); }
824[[gnu::always_inline]] inline float32_t reduce_max(float32x4_t a) { return vmaxvq_f32(a); }
825[[gnu::always_inline]] inline float32_t reduce_min(float32x4_t a) { return vminvq_f32(a); }
826[[gnu::always_inline]] inline float32_t reduce_max_strict(float32x4_t a) { return vmaxnmvq_f32(a); }
827[[gnu::always_inline]] inline float32_t reduce_min_strict(float32x4_t a) { return vminnmvq_f32(a); }
828[[gnu::always_inline]] nce uint32x4_t equal_to_zero(float32x4_t a) { return vceqzq_f32(a); }
829[[gnu::always_inline]] nce uint32x4_t greater_than_or_equal_to_zero(float32x4_t a) { return vcgezq_f32(a); }
830[[gnu::always_inline]] nce uint32x4_t less_than_or_equal_to_zero(float32x4_t a) { return vclezq_f32(a); }
831[[gnu::always_inline]] nce uint32x4_t greater_than_zero(float32x4_t a) { return vcgtzq_f32(a); }
832[[gnu::always_inline]] nce uint32x4_t less_than_zero(float32x4_t a) { return vcltzq_f32(a); }
833[[gnu::always_inline]] nce float64x2_t convert_high(float32x4_t a) { return vcvt_high_f64_f32(a); }
834template <> [[gnu::always_inline]] nce float64x2_t reinterpret(float32x4_t a) { return vreinterpretq_f64_f32(a); }
835template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_lane(float32x4_t a, float32x4_t v) { return vmulq_laneq_f32(a, v, lane); }
836template <int lane>[[gnu::always_inline]] nce float32x2_t duplicate_lane(float32x4_t vec) { return vdup_laneq_f32(vec, lane); }
837template <int lane>[[gnu::always_inline]] nce float32x4_t duplicate_lane_quad(float32x4_t vec) { return vdupq_laneq_f32(vec, lane); }
838[[gnu::always_inline]] nce float32x4_t round_to_32bit_integer(float32x4_t a) { return vrnd32zq_f32(a); }
839[[gnu::always_inline]] nce float32x4_t round_to_64bit_integer(float32x4_t a) { return vrnd64zq_f32(a); }
840[[gnu::always_inline]] nce float32x4_t round_to_32bit_integer_using_current_mode(float32x4_t a) { return vrnd32xq_f32(a); }
841[[gnu::always_inline]] nce float32x4_t round_to_64bit_integer_using_current_mode(float32x4_t a) { return vrnd64xq_f32(a); }
842template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_add_lane(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, lane); }
843template <int lane>[[gnu::always_inline]] nce float32x4_t multiply_subtract_lane(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, lane); }
844[[gnu::always_inline]] nce float32x4_t zip1(float32x4_t a, float32x4_t b) { return vzip1q_f32(a, b); }
845[[gnu::always_inline]] nce float32x4_t zip2(float32x4_t a, float32x4_t b) { return vzip2q_f32(a, b); }
846[[gnu::always_inline]] nce float32x4_t unzip1(float32x4_t a, float32x4_t b) { return vuzp1q_f32(a, b); }
847[[gnu::always_inline]] nce float32x4_t unzip2(float32x4_t a, float32x4_t b) { return vuzp2q_f32(a, b); }
848[[gnu::always_inline]] nce float32x4_t transpose_step_1(float32x4_t a, float32x4_t b) { return vtrn1q_f32(a, b); }
849[[gnu::always_inline]] nce float32x4_t transpose_step_2(float32x4_t a, float32x4_t b) { return vtrn2q_f32(a, b); }
850template <int lane1, int lane2>[[gnu::always_inline]] nce float32x4_t copy_lane(float32x4_t a, float32x2_t b) { return vcopyq_lane_f32(a, lane1, b, lane2); }
851template <int lane1, int lane2>[[gnu::always_inline]] nce float32x4_t copy_lane(float32x4_t a, float32x4_t b) { return vcopyq_laneq_f32(a, lane1, b, lane2); }
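Usage sketch (editor's addition, not part of the header; the example_ name is hypothetical): a four-lane dot product using the float32x4_t reduction defined above together with the plain vmulq_f32 intrinsic.

[[gnu::always_inline]] inline float32_t example_dot_f32(float32x4_t a, float32x4_t b) {
  return reduce_add(vmulq_f32(a, b));  // vaddvq_f32 over the lane-wise products
}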
852[[gnu::always_inline]] nce float64x1_t add(float64x1_t a, float64x1_t b) { return vadd_f64(a, b); }
853[[gnu::always_inline]] nce float64x1_t multiply(float64x1_t a, float64x1_t b) { return vmul_f64(a, b); }
854[[gnu::always_inline]] nce float64x1_t multiply_extended(float64x1_t a, float64x1_t b) { return vmulx_f64(a, b); }
855template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_extended_lane(float64x1_t a, float64x1_t v) { return vmulx_lane_f64(a, v, lane); }
856[[gnu::always_inline]] nce float64x1_t multiply_add(float64x1_t a, float64x1_t b, float64x1_t c) { return vmla_f64(a, b, c); }
857[[gnu::always_inline]] nce float64x1_t multiply_subtract(float64x1_t a, float64x1_t b, float64x1_t c) { return vmls_f64(a, b, c); }
858[[gnu::always_inline]] nce float64x1_t multiply_add_fused(float64x1_t a, float64x1_t b, float64x1_t c) { return vfma_f64(a, b, c); }
859template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_add_fused_lane(float64x1_t a, float64x1_t b, float64x1_t v) { return vfma_lane_f64(a, b, v, lane); }
860[[gnu::always_inline]] nce float64x1_t multiply_subtract_fused(float64x1_t a, float64x1_t b, float64x1_t c) { return vfms_f64(a, b, c); }
861template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_subtract_fused_lane(float64x1_t a, float64x1_t b, float64x1_t v) { return vfms_lane_f64(a, b, v, lane); }
862template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_add_fused_lane(float64x1_t a, float64x1_t b, float64x2_t v) { return vfma_laneq_f64(a, b, v, lane); }
863template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_subtract_fused_lane(float64x1_t a, float64x1_t b, float64x2_t v) { return vfms_laneq_f64(a, b, v, lane); }
864[[gnu::always_inline]] nce float64x1_t divide(float64x1_t a, float64x1_t b) { return vdiv_f64(a, b); }
865[[gnu::always_inline]] nce float64x1_t subtract(float64x1_t a, float64x1_t b) { return vsub_f64(a, b); }
866[[gnu::always_inline]] nce float64x1_t subtract_absolute(float64x1_t a, float64x1_t b) { return vabd_f64(a, b); }
867[[gnu::always_inline]] nce float64x1_t absolute(float64x1_t a) { return vabs_f64(a); }
868template <> [[gnu::always_inline]] nce float64x1_t max(float64x1_t a, float64x1_t b) { return vmax_f64(a, b); }
869template <> [[gnu::always_inline]] nce float64x1_t min(float64x1_t a, float64x1_t b) { return vmin_f64(a, b); }
870template <> [[gnu::always_inline]] nce float64x1_t max_strict(float64x1_t a, float64x1_t b) { return vmaxnm_f64(a, b); }
871template <> [[gnu::always_inline]] nce float64x1_t min_strict(float64x1_t a, float64x1_t b) { return vminnm_f64(a, b); }
872[[gnu::always_inline]] nce float64x1_t round(float64x1_t a) { return vrnd_f64(a); }
873[[gnu::always_inline]] nce float64x1_t round_toward_negative_infinity(float64x1_t a) { return vrndm_f64(a); }
874[[gnu::always_inline]] nce float64x1_t round_toward_positive_infinity(float64x1_t a) { return vrndp_f64(a); }
875[[gnu::always_inline]] nce float64x1_t round_to_nearest_with_ties_away_from_zero(float64x1_t a) { return vrnda_f64(a); }
876[[gnu::always_inline]] nce float64x1_t round_using_current_mode(float64x1_t a) { return vrndi_f64(a); }
877[[gnu::always_inline]] nce float64x1_t round_inexact(float64x1_t a) { return vrndx_f64(a); }
878[[gnu::always_inline]] nce float64x1_t reciprocal_estimate(float64x1_t a) { return vrecpe_f64(a); }
879[[gnu::always_inline]] nce float64x1_t reciprocal_step(float64x1_t a, float64x1_t b) { return vrecps_f64(a, b); }
880[[gnu::always_inline]] nce float64x1_t reciprocal_sqrt_estimate(float64x1_t a) { return vrsqrte_f64(a); }
881[[gnu::always_inline]] nce float64x1_t reciprocal_sqrt_step(float64x1_t a, float64x1_t b) { return vrsqrts_f64(a, b); }
882[[gnu::always_inline]] nce float64x1_t square_root(float64x1_t a) { return vsqrt_f64(a); }
883[[gnu::always_inline]] nce uint64x1_t equal(float64x1_t a, float64x1_t b) { return vceq_f64(a, b); }
884template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_extended_lane(float64x1_t a, float64x2_t v) { return vmulx_laneq_f64(a, v, lane); }
885[[gnu::always_inline]] nce uint64x1_t equal_to_zero(float64x1_t a) { return vceqz_f64(a); }
886[[gnu::always_inline]] nce uint64x1_t greater_than_or_equal(float64x1_t a, float64x1_t b) { return vcge_f64(a, b); }
887[[gnu::always_inline]] nce uint64x1_t greater_than_or_equal_to_zero(float64x1_t a) { return vcgez_f64(a); }
888[[gnu::always_inline]] nce uint64x1_t less_than_or_equal(float64x1_t a, float64x1_t b) { return vcle_f64(a, b); }
889[[gnu::always_inline]] nce uint64x1_t less_than_or_equal_to_zero(float64x1_t a) { return vclez_f64(a); }
890[[gnu::always_inline]] nce uint64x1_t greater_than(float64x1_t a, float64x1_t b) { return vcgt_f64(a, b); }
891[[gnu::always_inline]] nce uint64x1_t greater_than_zero(float64x1_t a) { return vcgtz_f64(a); }
892[[gnu::always_inline]] nce uint64x1_t less_than(float64x1_t a, float64x1_t b) { return vclt_f64(a, b); }
893[[gnu::always_inline]] nce uint64x1_t less_than_zero(float64x1_t a) { return vcltz_f64(a); }
894[[gnu::always_inline]] nce uint64x1_t absolute_greater_than_or_equal(float64x1_t a, float64x1_t b) { return vcage_f64(a, b); }
895[[gnu::always_inline]] nce uint64x1_t absolute_less_than_or_equal(float64x1_t a, float64x1_t b) { return vcale_f64(a, b); }
896[[gnu::always_inline]] nce uint64x1_t absolute_greater_than(float64x1_t a, float64x1_t b) { return vcagt_f64(a, b); }
897[[gnu::always_inline]] nce uint64x1_t absolute_less_than(float64x1_t a, float64x1_t b) { return vcalt_f64(a, b); }
898template <> [[gnu::always_inline]] nce int64x1_t convert(float64x1_t a) { return vcvt_s64_f64(a); }
899template <> [[gnu::always_inline]] nce uint64x1_t convert(float64x1_t a) { return vcvt_u64_f64(a); }
900template <> [[gnu::always_inline]] nce int64x1_t convert_round_to_nearest_with_ties_to_even(float64x1_t a) { return vcvtn_s64_f64(a); }
901template <> [[gnu::always_inline]] nce uint64x1_t convert_round_to_nearest_with_ties_to_even(float64x1_t a) { return vcvtn_u64_f64(a); }
902template <> [[gnu::always_inline]] nce int64x1_t convert_round_toward_negative_infinity(float64x1_t a) { return vcvtm_s64_f64(a); }
903template <> [[gnu::always_inline]] nce uint64x1_t convert_round_toward_negative_infinity(float64x1_t a) { return vcvtm_u64_f64(a); }
904template <> [[gnu::always_inline]] nce int64x1_t convert_round_toward_positive_infinity(float64x1_t a) { return vcvtp_s64_f64(a); }
905template <> [[gnu::always_inline]] nce uint64x1_t convert_round_toward_positive_infinity(float64x1_t a) { return vcvtp_u64_f64(a); }
906template <> [[gnu::always_inline]] nce int64x1_t convert_round_to_nearest_with_ties_away_from_zero(float64x1_t a) { return vcvta_s64_f64(a); }
907template <> [[gnu::always_inline]] nce uint64x1_t convert_round_to_nearest_with_ties_away_from_zero(float64x1_t a) { return vcvta_u64_f64(a); }
908template <int n>[[gnu::always_inline]] nce int64x1_t convert(float64x1_t a) { return vcvt_n_s64_f64(a, n); }
909template <int n>[[gnu::always_inline]] nce uint64x1_t convert(float64x1_t a) { return vcvt_n_u64_f64(a, n); }
910template <> [[gnu::always_inline]] nce poly64x1_t reinterpret(float64x1_t a) { return vreinterpret_p64_f64(a); }
911template <> [[gnu::always_inline]] nce int8x8_t reinterpret(float64x1_t a) { return vreinterpret_s8_f64(a); }
912template <> [[gnu::always_inline]] nce int16x4_t reinterpret(float64x1_t a) { return vreinterpret_s16_f64(a); }
913template <> [[gnu::always_inline]] nce int32x2_t reinterpret(float64x1_t a) { return vreinterpret_s32_f64(a); }
914template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(float64x1_t a) { return vreinterpret_u8_f64(a); }
915template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(float64x1_t a) { return vreinterpret_u16_f64(a); }
916template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(float64x1_t a) { return vreinterpret_u32_f64(a); }
917template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(float64x1_t a) { return vreinterpret_p8_f64(a); }
918template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(float64x1_t a) { return vreinterpret_p16_f64(a); }
919template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(float64x1_t a) { return vreinterpret_u64_f64(a); }
920template <> [[gnu::always_inline]] nce int64x1_t reinterpret(float64x1_t a) { return vreinterpret_s64_f64(a); }
921template <> [[gnu::always_inline]] nce float16x4_t reinterpret(float64x1_t a) { return vreinterpret_f16_f64(a); }
922template <> [[gnu::always_inline]] nce float32x2_t reinterpret(float64x1_t a) { return vreinterpret_f32_f64(a); }
923template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_lane(float64x1_t a, float64x1_t v) { return vmul_lane_f64(a, v, lane); }
924[[gnu::always_inline]] nce float64x1_t multiply_add_fused(float64x1_t a, float64x1_t b, float64_t n) { return vfma_n_f64(a, b, n); }
925[[gnu::always_inline]] nce float64x1_t multiply_subtract_fused(float64x1_t a, float64x1_t b, float64_t n) { return vfms_n_f64(a, b, n); }
926template <int lane>[[gnu::always_inline]] nce float64x1_t multiply_lane(float64x1_t a, float64x2_t v) { return vmul_laneq_f64(a, v, lane); }
927[[gnu::always_inline]] nce float64x1_t multiply(float64x1_t a, float64_t b) { return vmul_n_f64(a, b); }
928[[gnu::always_inline]] nce float64x1_t negate(float64x1_t a) { return vneg_f64(a); }
929template <int lane>[[gnu::always_inline]] nce float64x1_t duplicate_lane(float64x1_t vec) { return vdup_lane_f64(vec, lane); }
930template <int lane>[[gnu::always_inline]] nce float64x2_t duplicate_lane_quad(float64x1_t vec) { return vdupq_lane_f64(vec, lane); }
931[[gnu::always_inline]] nce float64x2_t combine(float64x1_t low, float64x1_t high) { return vcombine_f64(low, high); }
932template <int lane>[[gnu::always_inline]] nce float64_t duplicate_lane(float64x1_t vec) { return vdupd_lane_f64(vec, lane); }
933template <int lane>[[gnu::always_inline]] nce float64_t get_lane(float64x1_t v) { return vget_lane_f64(v, lane); }
934template <int n>[[gnu::always_inline]] nce float64x1_t extract(float64x1_t a, float64x1_t b) { return vext_f64(a, b, n); }
935template <int lane1, int lane2>[[gnu::always_inline]] nce float64x1_t copy_lane(float64x1_t a, float64x1_t b) { return vcopy_lane_f64(a, lane1, b, lane2); }
936template <int lane1, int lane2>[[gnu::always_inline]] nce float64x1_t copy_lane(float64x1_t a, float64x2_t b) { return vcopy_laneq_f64(a, lane1, b, lane2); }
937[[gnu::always_inline]] nce float64x1_t round_to_32bit_integer(float64x1_t a) { return vrnd32z_f64(a); }
938[[gnu::always_inline]] nce float64x1_t round_to_64bit_integer(float64x1_t a) { return vrnd64z_f64(a); }
939[[gnu::always_inline]] nce float64x1_t round_to_32bit_integer_using_current_mode(float64x1_t a) { return vrnd32x_f64(a); }
940[[gnu::always_inline]] nce float64x1_t round_to_64bit_integer_using_current_mode(float64x1_t a) { return vrnd64x_f64(a); }
941template <> [[gnu::always_inline]] nce bfloat16x4_t reinterpret(float64x1_t a) { return vreinterpret_bf16_f64(a); }
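Usage sketch (editor's addition, not part of the header; the example_ name is hypothetical): clamping a float64x1_t with the strict (IEEE 754 maxNum/minNum) wrappers specialized above; because maxNum drops a NaN operand, a NaN input clamps to lo.

[[gnu::always_inline]] inline float64x1_t example_clamp_f64(float64x1_t v, float64x1_t lo, float64x1_t hi) {
  return min_strict<float64x1_t>(max_strict<float64x1_t>(v, lo), hi);  // vmaxnm_f64 then vminnm_f64
}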
942template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_extended_lane(float64x2_t a, float64x1_t v) { return vmulxq_lane_f64(a, v, lane); }
943[[gnu::always_inline]] nce float64x2_t add(float64x2_t a, float64x2_t b) { return vaddq_f64(a, b); }
944[[gnu::always_inline]] nce float64x2_t multiply(float64x2_t a, float64x2_t b) { return vmulq_f64(a, b); }
945[[gnu::always_inline]] nce float64x2_t multiply_extended(float64x2_t a, float64x2_t b) { return vmulxq_f64(a, b); }
946template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_extended_lane(float64x2_t a, float64x2_t v) { return vmulxq_laneq_f64(a, v, lane); }
947template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_add_fused_lane(float64x2_t a, float64x2_t b, float64x1_t v) { return vfmaq_lane_f64(a, b, v, lane); }
948template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_subtract_fused_lane(float64x2_t a, float64x2_t b, float64x1_t v) { return vfmsq_lane_f64(a, b, v, lane); }
949[[gnu::always_inline]] nce float64x2_t multiply_add(float64x2_t a, float64x2_t b, float64x2_t c) { return vmlaq_f64(a, b, c); }
950[[gnu::always_inline]] nce float64x2_t multiply_subtract(float64x2_t a, float64x2_t b, float64x2_t c) { return vmlsq_f64(a, b, c); }
951[[gnu::always_inline]] nce float64x2_t multiply_add_fused(float64x2_t a, float64x2_t b, float64x2_t c) { return vfmaq_f64(a, b, c); }
952template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_add_fused_lane(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmaq_laneq_f64(a, b, v, lane); }
953[[gnu::always_inline]] nce float64x2_t multiply_subtract_fused(float64x2_t a, float64x2_t b, float64x2_t c) { return vfmsq_f64(a, b, c); }
954template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_subtract_fused_lane(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmsq_laneq_f64(a, b, v, lane); }
955[[gnu::always_inline]] nce float64x2_t divide(float64x2_t a, float64x2_t b) { return vdivq_f64(a, b); }
956[[gnu::always_inline]] nce float64x2_t subtract(float64x2_t a, float64x2_t b) { return vsubq_f64(a, b); }
957[[gnu::always_inline]] nce float64x2_t subtract_absolute(float64x2_t a, float64x2_t b) { return vabdq_f64(a, b); }
958[[gnu::always_inline]] nce float64x2_t absolute(float64x2_t a) { return vabsq_f64(a); }
959template <> [[gnu::always_inline]] nce float64x2_t max(float64x2_t a, float64x2_t b) { return vmaxq_f64(a, b); }
960template <> [[gnu::always_inline]] nce float64x2_t min(float64x2_t a, float64x2_t b) { return vminq_f64(a, b); }
961template <> [[gnu::always_inline]] nce float64x2_t max_strict(float64x2_t a, float64x2_t b) { return vmaxnmq_f64(a, b); }
962template <> [[gnu::always_inline]] nce float64x2_t min_strict(float64x2_t a, float64x2_t b) { return vminnmq_f64(a, b); }
963[[gnu::always_inline]] nce float64x2_t round(float64x2_t a) { return vrndq_f64(a); }
964[[gnu::always_inline]] nce float64x2_t round_toward_negative_infinity(float64x2_t a) { return vrndmq_f64(a); }
965[[gnu::always_inline]] nce float64x2_t round_toward_positive_infinity(float64x2_t a) { return vrndpq_f64(a); }
966[[gnu::always_inline]] nce float64x2_t round_to_nearest_with_ties_away_from_zero(float64x2_t a) { return vrndaq_f64(a); }
967[[gnu::always_inline]] nce float64x2_t round_using_current_mode(float64x2_t a) { return vrndiq_f64(a); }
968[[gnu::always_inline]] nce float64x2_t round_inexact(float64x2_t a) { return vrndxq_f64(a); }
969[[gnu::always_inline]] nce float64x2_t reciprocal_estimate(float64x2_t a) { return vrecpeq_f64(a); }
970[[gnu::always_inline]] nce float64x2_t reciprocal_step(float64x2_t a, float64x2_t b) { return vrecpsq_f64(a, b); }
971[[gnu::always_inline]] nce float64x2_t reciprocal_sqrt_estimate(float64x2_t a) { return vrsqrteq_f64(a); }
972[[gnu::always_inline]] nce float64x2_t reciprocal_sqrt_step(float64x2_t a, float64x2_t b) { return vrsqrtsq_f64(a, b); }
973[[gnu::always_inline]] nce float64x2_t square_root(float64x2_t a) { return vsqrtq_f64(a); }
974[[gnu::always_inline]] nce float64x2_t pairwise_add(float64x2_t a, float64x2_t b) { return vpaddq_f64(a, b); }
975[[gnu::always_inline]] nce float64_t pairwise_add(float64x2_t a) { return vpaddd_f64(a); }
976[[gnu::always_inline]] nce float64x2_t pairwise_max(float64x2_t a, float64x2_t b) { return vpmaxq_f64(a, b); }
977[[gnu::always_inline]] nce float64_t pairwise_max(float64x2_t a) { return vpmaxqd_f64(a); }
978[[gnu::always_inline]] nce float64_t pairwise_max_strict(float64x2_t a) { return vpmaxnmqd_f64(a); }
979[[gnu::always_inline]] nce float64x2_t pairwise_min(float64x2_t a, float64x2_t b) { return vpminq_f64(a, b); }
980[[gnu::always_inline]] nce float64x2_t pairwise_max_strict(float64x2_t a, float64x2_t b) { return vpmaxnmq_f64(a, b); }
981[[gnu::always_inline]] nce float64x2_t pairwise_min_strict(float64x2_t a, float64x2_t b) { return vpminnmq_f64(a, b); }
982[[gnu::always_inline]] nce float64_t pairwise_min(float64x2_t a) { return vpminqd_f64(a); }
983[[gnu::always_inline]] nce float64_t pairwise_min_strict(float64x2_t a) { return vpminnmqd_f64(a); }
984[[gnu::always_inline]] nce float64_t reduce_add(float64x2_t a) { return vaddvq_f64(a); }
985[[gnu::always_inline]] inline float64_t reduce_max(float64x2_t a) { return vmaxvq_f64(a); }
986[[gnu::always_inline]] inline float64_t reduce_min(float64x2_t a) { return vminvq_f64(a); }
987[[gnu::always_inline]] inline float64_t reduce_max_strict(float64x2_t a) { return vmaxnmvq_f64(a); }
988[[gnu::always_inline]] inline float64_t reduce_min_strict(float64x2_t a) { return vminnmvq_f64(a); }
989[[gnu::always_inline]] nce uint64x2_t equal(float64x2_t a, float64x2_t b) { return vceqq_f64(a, b); }
990[[gnu::always_inline]] nce uint64x2_t equal_to_zero(float64x2_t a) { return vceqzq_f64(a); }
991[[gnu::always_inline]] nce uint64x2_t greater_than_or_equal(float64x2_t a, float64x2_t b) { return vcgeq_f64(a, b); }
992[[gnu::always_inline]] nce uint64x2_t greater_than_or_equal_to_zero(float64x2_t a) { return vcgezq_f64(a); }
993[[gnu::always_inline]] nce uint64x2_t less_than_or_equal(float64x2_t a, float64x2_t b) { return vcleq_f64(a, b); }
994[[gnu::always_inline]] nce uint64x2_t less_than_or_equal_to_zero(float64x2_t a) { return vclezq_f64(a); }
995[[gnu::always_inline]] nce uint64x2_t greater_than(float64x2_t a, float64x2_t b) { return vcgtq_f64(a, b); }
996[[gnu::always_inline]] nce uint64x2_t greater_than_zero(float64x2_t a) { return vcgtzq_f64(a); }
997[[gnu::always_inline]] nce uint64x2_t less_than(float64x2_t a, float64x2_t b) { return vcltq_f64(a, b); }
998[[gnu::always_inline]] nce uint64x2_t less_than_zero(float64x2_t a) { return vcltzq_f64(a); }
999[[gnu::always_inline]] nce uint64x2_t absolute_greater_than_or_equal(float64x2_t a, float64x2_t b) { return vcageq_f64(a, b); }
1000[[gnu::always_inline]] nce uint64x2_t absolute_less_than_or_equal(float64x2_t a, float64x2_t b) { return vcaleq_f64(a, b); }
1001[[gnu::always_inline]] nce uint64x2_t absolute_greater_than(float64x2_t a, float64x2_t b) { return vcagtq_f64(a, b); }
1002[[gnu::always_inline]] nce uint64x2_t absolute_less_than(float64x2_t a, float64x2_t b) { return vcaltq_f64(a, b); }
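// Editor's note (hedged): these comparisons return per-lane masks of all ones or all
// zeros, which pair naturally with a bitwise select; a and b below are placeholder
// float64x2_t values:
//   uint64x2_t m = neon::greater_than(a, b);  // 0xFFFF... where a > b, else 0
//   float64x2_t sel = vbslq_f64(m, a, b);     // branch-free per-lane "larger of a, b"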
1003template <> [[gnu::always_inline]] nce int64x2_t convert(float64x2_t a) { return vcvtq_s64_f64(a); }
1004template <> [[gnu::always_inline]] nce uint64x2_t convert(float64x2_t a) { return vcvtq_u64_f64(a); }
1005template <> [[gnu::always_inline]] nce int64x2_t convert_round_to_nearest_with_ties_to_even(float64x2_t a) { return vcvtnq_s64_f64(a); }
1006template <> [[gnu::always_inline]] nce uint64x2_t convert_round_to_nearest_with_ties_to_even(float64x2_t a) { return vcvtnq_u64_f64(a); }
1007template <> [[gnu::always_inline]] nce int64x2_t convert_round_toward_negative_infinity(float64x2_t a) { return vcvtmq_s64_f64(a); }
1008template <> [[gnu::always_inline]] nce uint64x2_t convert_round_toward_negative_infinity(float64x2_t a) { return vcvtmq_u64_f64(a); }
1009template <> [[gnu::always_inline]] nce int64x2_t convert_round_toward_positive_infinity(float64x2_t a) { return vcvtpq_s64_f64(a); }
1010template <> [[gnu::always_inline]] nce uint64x2_t convert_round_toward_positive_infinity(float64x2_t a) { return vcvtpq_u64_f64(a); }
1011template <> [[gnu::always_inline]] nce int64x2_t convert_round_to_nearest_with_ties_away_from_zero(float64x2_t a) { return vcvtaq_s64_f64(a); }
1012template <> [[gnu::always_inline]] nce uint64x2_t convert_round_to_nearest_with_ties_away_from_zero(float64x2_t a) { return vcvtaq_u64_f64(a); }
1013template <int n>[[gnu::always_inline]] nce int64x2_t convert(float64x2_t a) { return vcvtq_n_s64_f64(a, n); }
1014template <int n>[[gnu::always_inline]] nce uint64x2_t convert(float64x2_t a) { return vcvtq_n_u64_f64(a, n); }
1015template <> [[gnu::always_inline]] nce float32x2_t convert(float64x2_t a) { return vcvt_f32_f64(a); }
1016[[gnu::always_inline]] nce float32x2_t convert_round_to_odd(float64x2_t a) { return vcvtx_f32_f64(a); }
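// Editor's usage sketch (v is a placeholder float64x2_t): the specializations above are
// selected by the requested destination type, e.g.
//   int64x2_t t = neon::convert<int64x2_t>(v);                                    // vcvtq_s64_f64, truncates toward zero
//   int64x2_t n = neon::convert_round_to_nearest_with_ties_to_even<int64x2_t>(v); // vcvtnq_s64_f64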
1017template <> [[gnu::always_inline]] nce poly64x2_t reinterpret(float64x2_t a) { return vreinterpretq_p64_f64(a); }
1018template <> [[gnu::always_inline]] nce poly128_t reinterpret(float64x2_t a) { return vreinterpretq_p128_f64(a); }
1019template <> [[gnu::always_inline]] nce int8x16_t reinterpret(float64x2_t a) { return vreinterpretq_s8_f64(a); }
1020template <> [[gnu::always_inline]] nce int16x8_t reinterpret(float64x2_t a) { return vreinterpretq_s16_f64(a); }
1021template <> [[gnu::always_inline]] nce int32x4_t reinterpret(float64x2_t a) { return vreinterpretq_s32_f64(a); }
1022template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(float64x2_t a) { return vreinterpretq_u8_f64(a); }
1023template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(float64x2_t a) { return vreinterpretq_u16_f64(a); }
1024template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(float64x2_t a) { return vreinterpretq_u32_f64(a); }
1025template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(float64x2_t a) { return vreinterpretq_p8_f64(a); }
1026template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(float64x2_t a) { return vreinterpretq_p16_f64(a); }
1027template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(float64x2_t a) { return vreinterpretq_u64_f64(a); }
1028template <> [[gnu::always_inline]] nce int64x2_t reinterpret(float64x2_t a) { return vreinterpretq_s64_f64(a); }
1029template <> [[gnu::always_inline]] nce float16x8_t reinterpret(float64x2_t a) { return vreinterpretq_f16_f64(a); }
1030template <> [[gnu::always_inline]] nce float32x4_t reinterpret(float64x2_t a) { return vreinterpretq_f32_f64(a); }
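// Editor's usage sketch (assumes the matching uint64x2_t -> float64x2_t reinterpret
// overload declared earlier in this header; v is a placeholder float64x2_t):
//   uint64x2_t bits = neon::reinterpret<uint64x2_t>(v);          // vreinterpretq_u64_f64
//   bits = veorq_u64(bits, vdupq_n_u64(0x8000000000000000ull));  // flip the sign bits
//   float64x2_t negated = neon::reinterpret<float64x2_t>(bits);  // same lanes, signs toggled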
1031template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_lane(float64x2_t a, float64x1_t v) { return vmulq_lane_f64(a, v, lane); }
1032template <int lane>[[gnu::always_inline]] nce float64x2_t multiply_lane(float64x2_t a, float64x2_t v) { return vmulq_laneq_f64(a, v, lane); }
1033[[gnu::always_inline]] nce float64x2_t multiply_add_fused(float64x2_t a, float64x2_t b, float64_t n) { return vfmaq_n_f64(a, b, n); }
1034[[gnu::always_inline]] nce float64x2_t multiply_subtract_fused(float64x2_t a, float64x2_t b, float64_t n) { return vfmsq_n_f64(a, b, n); }
1035[[gnu::always_inline]] nce float64x2_t multiply(float64x2_t a, float64_t b) { return vmulq_n_f64(a, b); }
1036[[gnu::always_inline]] nce float64x2_t negate(float64x2_t a) { return vnegq_f64(a); }
1037template <int lane>[[gnu::always_inline]] nce float64x1_t duplicate_lane(float64x2_t vec) { return vdup_laneq_f64(vec, lane); }
1038template <int lane>[[gnu::always_inline]] nce float64x2_t duplicate_lane_quad(float64x2_t vec) { return vdupq_laneq_f64(vec, lane); }
1039template <> [[gnu::always_inline]] nce float64x1_t get_high(float64x2_t a) { return vget_high_f64(a); }
1040template <> [[gnu::always_inline]] nce float64x1_t get_low(float64x2_t a) { return vget_low_f64(a); }
1041template <int lane>[[gnu::always_inline]] nce float64_t duplicate_lane(float64x2_t vec) { return vdupd_laneq_f64(vec, lane); }
1042template <int lane>[[gnu::always_inline]] nce float64_t get_lane(float64x2_t v) { return vgetq_lane_f64(v, lane); }
1043template <int n>[[gnu::always_inline]] nce float64x2_t extract(float64x2_t a, float64x2_t b) { return vextq_f64(a, b, n); }
1044[[gnu::always_inline]] nce float64x2_t zip1(float64x2_t a, float64x2_t b) { return vzip1q_f64(a, b); }
1045[[gnu::always_inline]] nce float64x2_t zip2(float64x2_t a, float64x2_t b) { return vzip2q_f64(a, b); }
1046[[gnu::always_inline]] nce float64x2_t unzip1(float64x2_t a, float64x2_t b) { return vuzp1q_f64(a, b); }
1047[[gnu::always_inline]] nce float64x2_t unzip2(float64x2_t a, float64x2_t b) { return vuzp2q_f64(a, b); }
1048[[gnu::always_inline]] nce float64x2_t transpose_step_1(float64x2_t a, float64x2_t b) { return vtrn1q_f64(a, b); }
1049[[gnu::always_inline]] nce float64x2_t transpose_step_2(float64x2_t a, float64x2_t b) { return vtrn2q_f64(a, b); }
1050[[gnu::always_inline]] nce float64x2_t complex_add_rotate_90(float64x2_t a, float64x2_t b) { return vcaddq_rot90_f64(a, b); }
1051[[gnu::always_inline]] nce float64x2_t complex_add_rotate_270(float64x2_t a, float64x2_t b) { return vcaddq_rot270_f64(a, b); }
1052[[gnu::always_inline]] nce float64x2_t complex_multiply_add(float64x2_t r, float64x2_t a, float64x2_t b) { return vcmlaq_f64(r, a, b); }
1053[[gnu::always_inline]] nce float64x2_t complex_multiply_add_rotate_90(float64x2_t r, float64x2_t a, float64x2_t b) { return vcmlaq_rot90_f64(r, a, b); }
1054[[gnu::always_inline]] nce float64x2_t complex_multiply_add_rotate_180(float64x2_t r, float64x2_t a, float64x2_t b) { return vcmlaq_rot180_f64(r, a, b); }
1055[[gnu::always_inline]] nce float64x2_t complex_multiply_add_rotate_270(float64x2_t r, float64x2_t a, float64x2_t b) { return vcmlaq_rot270_f64(r, a, b); }
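// Editor's note (hedged, requires FEAT_FCMA / Armv8.3-A): with {real, imag} interleaved
// lanes, the rotation-0 and rotation-90 forms together accumulate a full complex product;
// a and b are placeholder float64x2_t complex values:
//   float64x2_t acc = vdupq_n_f64(0.0);
//   acc = neon::complex_multiply_add(acc, a, b);            // acc += {a.re*b.re, a.re*b.im}
//   acc = neon::complex_multiply_add_rotate_90(acc, a, b);  // acc += {-a.im*b.im, a.im*b.re}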
1056template <int lane1, int lane2>[[gnu::always_inline]] nce float64x2_t copy_lane(float64x2_t a, float64x1_t b) { return vcopyq_lane_f64(a, lane1, b, lane2); }
1057template <int lane1, int lane2>[[gnu::always_inline]] nce float64x2_t copy_lane(float64x2_t a, float64x2_t b) { return vcopyq_laneq_f64(a, lane1, b, lane2); }
1058[[gnu::always_inline]] nce float64x2_t round_to_32bit_integer(float64x2_t a) { return vrnd32zq_f64(a); }
1059[[gnu::always_inline]] nce float64x2_t round_to_64bit_integer(float64x2_t a) { return vrnd64zq_f64(a); }
1060[[gnu::always_inline]] nce float64x2_t round_to_32bit_integer_using_current_mode(float64x2_t a) { return vrnd32xq_f64(a); }
1061[[gnu::always_inline]] nce float64x2_t round_to_64bit_integer_using_current_mode(float64x2_t a) { return vrnd64xq_f64(a); }
1062template <> [[gnu::always_inline]] nce bfloat16x8_t reinterpret(float64x2_t a) { return vreinterpretq_bf16_f64(a); }
1063[[gnu::always_inline]] nce uint8x8_t equal_to_zero(poly8x8_t a) { return vceqz_p8(a); }
1064template <> [[gnu::always_inline]] nce float64x1_t reinterpret(poly8x8_t a) { return vreinterpret_f64_p8(a); }
1065[[gnu::always_inline]] nce poly8x8_t reverse_bits(poly8x8_t a) { return vrbit_p8(a); }
1066[[gnu::always_inline]] nce poly8x8_t zip1(poly8x8_t a, poly8x8_t b) { return vzip1_p8(a, b); }
1067[[gnu::always_inline]] nce poly8x8_t zip2(poly8x8_t a, poly8x8_t b) { return vzip2_p8(a, b); }
1068[[gnu::always_inline]] nce poly8x8_t unzip1(poly8x8_t a, poly8x8_t b) { return vuzp1_p8(a, b); }
1069[[gnu::always_inline]] nce poly8x8_t unzip2(poly8x8_t a, poly8x8_t b) { return vuzp2_p8(a, b); }
1070[[gnu::always_inline]] nce poly8x8_t transpose_step_1(poly8x8_t a, poly8x8_t b) { return vtrn1_p8(a, b); }
1071[[gnu::always_inline]] nce poly8x8_t transpose_step_2(poly8x8_t a, poly8x8_t b) { return vtrn2_p8(a, b); }
1072template <int lane1, int lane2>[[gnu::always_inline]] nce poly8x8_t copy_lane(poly8x8_t a, poly8x8_t b) { return vcopy_lane_p8(a, lane1, b, lane2); }
1073template <int lane1, int lane2>[[gnu::always_inline]] nce poly8x8_t copy_lane(poly8x8_t a, poly8x16_t b) { return vcopy_laneq_p8(a, lane1, b, lane2); }
1074[[gnu::always_inline]] nce poly8x8_t table_extend1_saturate(poly8x8_t a, poly8x16_t t, uint8x8_t idx) { return vqtbx1_p8(a, t, idx); }
1075[[gnu::always_inline]] nce poly8x8_t table_extend2_saturate(poly8x8_t a, poly8x16x2_t t, uint8x8_t idx) { return vqtbx2_p8(a, t, idx); }
1076[[gnu::always_inline]] nce poly8x8_t table_extend3_saturate(poly8x8_t a, poly8x16x3_t t, uint8x8_t idx) { return vqtbx3_p8(a, t, idx); }
1077[[gnu::always_inline]] nce poly8x8_t table_extend4_saturate(poly8x8_t a, poly8x16x4_t t, uint8x8_t idx) { return vqtbx4_p8(a, t, idx); }
1078template <> [[gnu::always_inline]] nce float64x1_t reinterpret(poly16x4_t a) { return vreinterpret_f64_p16(a); }
1079[[gnu::always_inline]] nce poly16x4_t zip1(poly16x4_t a, poly16x4_t b) { return vzip1_p16(a, b); }
1080[[gnu::always_inline]] nce poly16x4_t zip2(poly16x4_t a, poly16x4_t b) { return vzip2_p16(a, b); }
1081[[gnu::always_inline]] nce poly16x4_t unzip1(poly16x4_t a, poly16x4_t b) { return vuzp1_p16(a, b); }
1082[[gnu::always_inline]] nce poly16x4_t unzip2(poly16x4_t a, poly16x4_t b) { return vuzp2_p16(a, b); }
1083[[gnu::always_inline]] nce poly16x4_t transpose_step_1(poly16x4_t a, poly16x4_t b) { return vtrn1_p16(a, b); }
1084[[gnu::always_inline]] nce poly16x4_t transpose_step_2(poly16x4_t a, poly16x4_t b) { return vtrn2_p16(a, b); }
1085template <int lane1, int lane2>[[gnu::always_inline]] nce poly16x4_t copy_lane(poly16x4_t a, poly16x4_t b) { return vcopy_lane_p16(a, lane1, b, lane2); }
1086template <int lane1, int lane2>[[gnu::always_inline]] nce poly16x4_t copy_lane(poly16x4_t a, poly16x8_t b) { return vcopy_laneq_p16(a, lane1, b, lane2); }
1087[[gnu::always_inline]] nce int64_t add(int64_t a, int64_t b) { return vaddd_s64(a, b); }
1088[[gnu::always_inline]] nce uint64_t add(uint64_t a, uint64_t b) { return vaddd_u64(a, b); }
1089[[gnu::always_inline]] nce int64x2_t add_high(int64x2_t a, int32x4_t b) { return vaddw_high_s32(a, b); }
1090[[gnu::always_inline]] nce int8_t add_saturate(int8_t a, int8_t b) { return vqaddb_s8(a, b); }
1091[[gnu::always_inline]] nce int16_t add_saturate(int16_t a, int16_t b) { return vqaddh_s16(a, b); }
1092[[gnu::always_inline]] nce int32_t add_saturate(int32_t a, int32_t b) { return vqadds_s32(a, b); }
1093[[gnu::always_inline]] nce int64_t add_saturate(int64_t a, int64_t b) { return vqaddd_s64(a, b); }
1094[[gnu::always_inline]] nce uint8_t add_saturate(uint8_t a, uint8_t b) { return vqaddb_u8(a, b); }
1095[[gnu::always_inline]] nce uint16_t add_saturate(uint16_t a, uint16_t b) { return vqaddh_u16(a, b); }
1096[[gnu::always_inline]] nce uint32_t add_saturate(uint32_t a, uint32_t b) { return vqadds_u32(a, b); }
1097[[gnu::always_inline]] nce uint64_t add_saturate(uint64_t a, uint64_t b) { return vqaddd_u64(a, b); }
1098[[gnu::always_inline]] nce int64x1_t add_saturate(int64x1_t a, uint64x1_t b) { return vuqadd_s64(a, b); }
1099[[gnu::always_inline]] nce int64x2_t add_saturate(int64x2_t a, uint64x2_t b) { return vuqaddq_s64(a, b); }
1100[[gnu::always_inline]] nce int8_t add_saturate(int8_t a, uint8_t b) { return vuqaddb_s8(a, b); }
1101[[gnu::always_inline]] nce int16_t add_saturate(int16_t a, uint16_t b) { return vuqaddh_s16(a, b); }
1102[[gnu::always_inline]] nce int32_t add_saturate(int32_t a, uint32_t b) { return vuqadds_s32(a, b); }
1103[[gnu::always_inline]] nce int64_t add_saturate(int64_t a, uint64_t b) { return vuqaddd_s64(a, b); }
1104[[gnu::always_inline]] nce uint8_t add_saturate(uint8_t a, int8_t b) { return vsqaddb_u8(a, b); }
1105[[gnu::always_inline]] nce uint16_t add_saturate(uint16_t a, int16_t b) { return vsqaddh_u16(a, b); }
1106[[gnu::always_inline]] nce uint32_t add_saturate(uint32_t a, int32_t b) { return vsqadds_u32(a, b); }
1107[[gnu::always_inline]] nce uint64_t add_saturate(uint64_t a, int64_t b) { return vsqaddd_u64(a, b); }
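// Editor's usage sketch (not part of the original header): the scalar forms clamp
// instead of wrapping, and the mixed-signedness overloads wrap vuqadd/vsqadd:
//   int8_t  a = neon::add_saturate(int8_t{100}, int8_t{100});    // vqaddb_s8  -> 127
//   uint8_t b = neon::add_saturate(uint8_t{200}, int8_t{-100});  // vsqaddb_u8 -> 100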
1108[[gnu::always_inline]] nce float32_t multiply_extended(float32_t a, float32_t b) { return vmulxs_f32(a, b); }
1109[[gnu::always_inline]] nce float64_t multiply_extended(float64_t a, float64_t b) { return vmulxd_f64(a, b); }
1110template <int lane>[[gnu::always_inline]] nce float32_t multiply_extended_lane(float32_t a, float32x2_t v) { return vmulxs_lane_f32(a, v, lane); }
1111template <int lane>[[gnu::always_inline]] nce float64_t multiply_extended_lane(float64_t a, float64x1_t v) { return vmulxd_lane_f64(a, v, lane); }
1112template <int lane>[[gnu::always_inline]] nce float32_t multiply_extended_lane(float32_t a, float32x4_t v) { return vmulxs_laneq_f32(a, v, lane); }
1113template <int lane>[[gnu::always_inline]] nce float64_t multiply_extended_lane(float64_t a, float64x2_t v) { return vmulxd_laneq_f64(a, v, lane); }
1114[[gnu::always_inline]] nce int64x2_t multiply_add_long_high(int64x2_t a, int32x4_t b, int32x4_t c) { return vmlal_high_s32(a, b, c); }
1115[[gnu::always_inline]] nce int64x2_t multiply_subtract_long_high(int64x2_t a, int32x4_t b, int32x4_t c) { return vmlsl_high_s32(a, b, c); }
1116template <int lane>[[gnu::always_inline]] nce float32_t multiply_add_fused_lane(float32_t a, float32_t b, float32x2_t v) { return vfmas_lane_f32(a, b, v, lane); }
1117template <int lane>[[gnu::always_inline]] nce float64_t multiply_add_fused_lane(float64_t a, float64_t b, float64x1_t v) { return vfmad_lane_f64(a, b, v, lane); }
1118template <int lane>[[gnu::always_inline]] nce float32_t multiply_add_fused_lane(float32_t a, float32_t b, float32x4_t v) { return vfmas_laneq_f32(a, b, v, lane); }
1119template <int lane>[[gnu::always_inline]] nce float64_t multiply_add_fused_lane(float64_t a, float64_t b, float64x2_t v) { return vfmad_laneq_f64(a, b, v, lane); }
1120template <int lane>[[gnu::always_inline]] nce float32_t multiply_subtract_fused_lane(float32_t a, float32_t b, float32x2_t v) { return vfmss_lane_f32(a, b, v, lane); }
1121template <int lane>[[gnu::always_inline]] nce float64_t multiply_subtract_fused_lane(float64_t a, float64_t b, float64x1_t v) { return vfmsd_lane_f64(a, b, v, lane); }
1122template <int lane>[[gnu::always_inline]] nce float32_t multiply_subtract_fused_lane(float32_t a, float32_t b, float32x4_t v) { return vfmss_laneq_f32(a, b, v, lane); }
1123template <int lane>[[gnu::always_inline]] nce float64_t multiply_subtract_fused_lane(float64_t a, float64_t b, float64x2_t v) { return vfmsd_laneq_f64(a, b, v, lane); }
1124[[gnu::always_inline]] nce int16_t multiply_double_saturate_high(int16_t a, int16_t b) { return vqdmulhh_s16(a, b); }
1125[[gnu::always_inline]] nce int32_t multiply_double_saturate_high(int32_t a, int32_t b) { return vqdmulhs_s32(a, b); }
1126[[gnu::always_inline]] nce int16_t multiply_double_round_saturate_high(int16_t a, int16_t b) { return vqrdmulhh_s16(a, b); }
1127[[gnu::always_inline]] nce int32_t multiply_double_round_saturate_high(int32_t a, int32_t b) { return vqrdmulhs_s32(a, b); }
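// Editor's note (hedged): vqdmulhh_s16 computes (2*a*b) >> 16 with saturation, i.e. a
// Q15 fixed-point multiply; the rounding variant adds 0x8000 before the shift:
//   int16_t half = 0x4000;                                         // 0.5 in Q15
//   int16_t q = neon::multiply_double_saturate_high(half, half);   // -> 0x2000 (0.25 in Q15)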
1128[[gnu::always_inline]] nce int32_t multiply_double_saturate_long(int16_t a, int16_t b) { return vqdmullh_s16(a, b); }
1129[[gnu::always_inline]] nce int64_t multiply_double_saturate_long(int32_t a, int32_t b) { return vqdmulls_s32(a, b); }
1130[[gnu::always_inline]] nce int32_t multiply_double_add_saturate_long(int32_t a, int16_t b, int16_t c) { return vqdmlalh_s16(a, b, c); }
1131[[gnu::always_inline]] nce int64_t multiply_double_add_saturate_long(int64_t a, int32_t b, int32_t c) { return vqdmlals_s32(a, b, c); }
1132[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long_high(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlal_high_s32(a, b, c); }
1133[[gnu::always_inline]] nce int32_t multiply_double_subtract_saturate_long(int32_t a, int16_t b, int16_t c) { return vqdmlslh_s16(a, b, c); }
1134[[gnu::always_inline]] nce int64_t multiply_double_subtract_saturate_long(int64_t a, int32_t b, int32_t c) { return vqdmlsls_s32(a, b, c); }
1135[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long_high(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlsl_high_s32(a, b, c); }
1136template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_add_saturate_long_lane(int32_t a, int16_t b, int16x4_t v) { return vqdmlalh_lane_s16(a, b, v, lane); }
1137template <int lane>[[gnu::always_inline]] nce int64_t multiply_double_add_saturate_long_lane(int64_t a, int32_t b, int32x2_t v) { return vqdmlals_lane_s32(a, b, v, lane); }
1138template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long_lane_high(int64x2_t a, int32x4_t b, int32x2_t v) { return vqdmlal_high_lane_s32(a, b, v, lane); }
1139template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long_lane(int64x2_t a, int32x2_t b, int32x4_t v) { return vqdmlal_laneq_s32(a, b, v, lane); }
1140template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_add_saturate_long_lane(int32_t a, int16_t b, int16x8_t v) { return vqdmlalh_laneq_s16(a, b, v, lane); }
1141template <int lane>[[gnu::always_inline]] nce int64_t multiply_double_add_saturate_long_lane(int64_t a, int32_t b, int32x4_t v) { return vqdmlals_laneq_s32(a, b, v, lane); }
1142template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long_lane_high(int64x2_t a, int32x4_t b, int32x4_t v) { return vqdmlal_high_laneq_s32(a, b, v, lane); }
1143template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_subtract_saturate_long_lane(int32_t a, int16_t b, int16x4_t v) { return vqdmlslh_lane_s16(a, b, v, lane); }
1144template <int lane>[[gnu::always_inline]] nce int64_t multiply_double_subtract_saturate_long_lane(int64_t a, int32_t b, int32x2_t v) { return vqdmlsls_lane_s32(a, b, v, lane); }
1145template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long_lane_high(int64x2_t a, int32x4_t b, int32x2_t v) { return vqdmlsl_high_lane_s32(a, b, v, lane); }
1146template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long_lane(int64x2_t a, int32x2_t b, int32x4_t v) { return vqdmlsl_laneq_s32(a, b, v, lane); }
1147template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_subtract_saturate_long_lane(int32_t a, int16_t b, int16x8_t v) { return vqdmlslh_laneq_s16(a, b, v, lane); }
1148template <int lane>[[gnu::always_inline]] nce int64_t multiply_double_subtract_saturate_long_lane(int64_t a, int32_t b, int32x4_t v) { return vqdmlsls_laneq_s32(a, b, v, lane); }
1149template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long_lane_high(int64x2_t a, int32x4_t b, int32x4_t v) { return vqdmlsl_high_laneq_s32(a, b, v, lane); }
1150template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_saturate_long_lane(int16_t a, int16x4_t v) { return vqdmullh_lane_s16(a, v, lane); }
1151template <int lane>[[gnu::always_inline]] nce int64_t multiply_double_saturate_long_lane(int32_t a, int32x2_t v) { return vqdmulls_lane_s32(a, v, lane); }
1152template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_saturate_long_lane(int16_t a, int16x8_t v) { return vqdmullh_laneq_s16(a, v, lane); }
1153template <int lane>[[gnu::always_inline]] nce int64_t multiply_double_saturate_long_lane(int32_t a, int32x4_t v) { return vqdmulls_laneq_s32(a, v, lane); }
1154template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_saturate_high_lane(int16_t a, int16x4_t v) { return vqdmulhh_lane_s16(a, v, lane); }
1155template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_saturate_high_lane(int32_t a, int32x2_t v) { return vqdmulhs_lane_s32(a, v, lane); }
1156template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_saturate_high_lane(int16_t a, int16x8_t v) { return vqdmulhh_laneq_s16(a, v, lane); }
1157template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_saturate_high_lane(int32_t a, int32x4_t v) { return vqdmulhs_laneq_s32(a, v, lane); }
1158template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_round_saturate_high_lane(int16_t a, int16x4_t v) { return vqrdmulhh_lane_s16(a, v, lane); }
1159template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_round_saturate_high_lane(int32_t a, int32x2_t v) { return vqrdmulhs_lane_s32(a, v, lane); }
1160template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_round_saturate_high_lane(int16_t a, int16x8_t v) { return vqrdmulhh_laneq_s16(a, v, lane); }
1161template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_round_saturate_high_lane(int32_t a, int32x4_t v) { return vqrdmulhs_laneq_s32(a, v, lane); }
1162[[gnu::always_inline]] nce int64x2_t multiply_double_add_saturate_long_high(int64x2_t a, int32x4_t b, int32_t c) { return vqdmlal_high_n_s32(a, b, c); }
1163[[gnu::always_inline]] nce int64x2_t multiply_double_subtract_saturate_long_high(int64x2_t a, int32x4_t b, int32_t c) { return vqdmlsl_high_n_s32(a, b, c); }
1164[[gnu::always_inline]] nce poly16x8_t multiply_long_high(poly8x16_t a, poly8x16_t b) { return vmull_high_p8(a, b); }
1165[[gnu::always_inline]] nce int64_t subtract(int64_t a, int64_t b) { return vsubd_s64(a, b); }
1166[[gnu::always_inline]] nce uint64_t subtract(uint64_t a, uint64_t b) { return vsubd_u64(a, b); }
1167[[gnu::always_inline]] nce int64x2_t subtract_high(int64x2_t a, int32x4_t b) { return vsubw_high_s32(a, b); }
1168[[gnu::always_inline]] nce int8_t subtract_saturate(int8_t a, int8_t b) { return vqsubb_s8(a, b); }
1169[[gnu::always_inline]] nce int16_t subtract_saturate(int16_t a, int16_t b) { return vqsubh_s16(a, b); }
1170[[gnu::always_inline]] nce int32_t subtract_saturate(int32_t a, int32_t b) { return vqsubs_s32(a, b); }
1171[[gnu::always_inline]] nce int64_t subtract_saturate(int64_t a, int64_t b) { return vqsubd_s64(a, b); }
1172[[gnu::always_inline]] nce uint8_t subtract_saturate(uint8_t a, uint8_t b) { return vqsubb_u8(a, b); }
1173[[gnu::always_inline]] nce uint16_t subtract_saturate(uint16_t a, uint16_t b) { return vqsubh_u16(a, b); }
1174[[gnu::always_inline]] nce uint32_t subtract_saturate(uint32_t a, uint32_t b) { return vqsubs_u32(a, b); }
1175[[gnu::always_inline]] nce uint64_t subtract_saturate(uint64_t a, uint64_t b) { return vqsubd_u64(a, b); }
1176[[gnu::always_inline]] nce float32_t absolute_difference(float32_t a, float32_t b) { return vabds_f32(a, b); }
1177[[gnu::always_inline]] nce float64_t absolute_difference(float64_t a, float64_t b) { return vabdd_f64(a, b); }
1178[[gnu::always_inline]] nce int64x2_t subtract_absolute_add_high(int64x2_t a, int32x4_t b, int32x4_t c) { return vabal_high_s32(a, b, c); }
1179[[gnu::always_inline]] nce int64x1_t absolute(int64x1_t a) { return vabs_s64(a); }
1180[[gnu::always_inline]] nce int64_t absolute(int64_t a) { return vabsd_s64(a); }
1181[[gnu::always_inline]] nce int64x2_t absolute(int64x2_t a) { return vabsq_s64(a); }
1182[[gnu::always_inline]] nce int64x1_t absolute_saturate(int64x1_t a) { return vqabs_s64(a); }
1183[[gnu::always_inline]] nce int64x2_t absolute_saturate(int64x2_t a) { return vqabsq_s64(a); }
1184[[gnu::always_inline]] nce int8_t absolute_saturate(int8_t a) { return vqabsb_s8(a); }
1185[[gnu::always_inline]] nce int16_t absolute_saturate(int16_t a) { return vqabsh_s16(a); }
1186[[gnu::always_inline]] nce int32_t absolute_saturate(int32_t a) { return vqabss_s32(a); }
1187[[gnu::always_inline]] nce int64_t absolute_saturate(int64_t a) { return vqabsd_s64(a); }
1188[[gnu::always_inline]] nce float32_t reciprocal_estimate(float32_t a) { return vrecpes_f32(a); }
1189[[gnu::always_inline]] nce float64_t reciprocal_estimate(float64_t a) { return vrecped_f64(a); }
1190[[gnu::always_inline]] nce float32_t reciprocal_step(float32_t a, float32_t b) { return vrecpss_f32(a, b); }
1191[[gnu::always_inline]] nce float64_t reciprocal_step(float64_t a, float64_t b) { return vrecpsd_f64(a, b); }
1192[[gnu::always_inline]] nce float32_t reciprocal_sqrt_estimate(float32_t a) { return vrsqrtes_f32(a); }
1193[[gnu::always_inline]] nce float64_t reciprocal_sqrt_estimate(float64_t a) { return vrsqrted_f64(a); }
1194[[gnu::always_inline]] nce float32_t reciprocal_sqrt_step(float32_t a, float32_t b) { return vrsqrtss_f32(a, b); }
1195[[gnu::always_inline]] nce float64_t reciprocal_sqrt_step(float64_t a, float64_t b) { return vrsqrtsd_f64(a, b); }
1196[[gnu::always_inline]] nce float32_t reciprocal_exponent(float32_t a) { return vrecpxs_f32(a); }
1197[[gnu::always_inline]] nce float64_t reciprocal_exponent(float64_t a) { return vrecpxd_f64(a); }
1198[[gnu::always_inline]] nce int64x2_t pairwise_add(int64x2_t a, int64x2_t b) { return vpaddq_s64(a, b); }
1199[[gnu::always_inline]] nce int64_t pairwise_add(int64x2_t a) { return vpaddd_s64(a); }
1200[[gnu::always_inline]] nce int64_t reduce_add(int64x2_t a) { return vaddvq_s64(a); }
1201[[gnu::always_inline]] nce uint64x1_t equal(int64x1_t a, int64x1_t b) { return vceq_s64(a, b); }
1202[[gnu::always_inline]] nce uint64x2_t equal(int64x2_t a, int64x2_t b) { return vceqq_s64(a, b); }
1203[[gnu::always_inline]] nce uint64_t equal(int64_t a, int64_t b) { return vceqd_s64(a, b); }
1204[[gnu::always_inline]] nce uint64_t equal(uint64_t a, uint64_t b) { return vceqd_u64(a, b); }
1205[[gnu::always_inline]] nce uint32_t equal(float32_t a, float32_t b) { return vceqs_f32(a, b); }
1206[[gnu::always_inline]] nce uint64_t equal(float64_t a, float64_t b) { return vceqd_f64(a, b); }
1207[[gnu::always_inline]] nce uint8x16_t equal_to_zero(poly8x16_t a) { return vceqzq_p8(a); }
1208[[gnu::always_inline]] nce uint64x1_t equal_to_zero(int64x1_t a) { return vceqz_s64(a); }
1209[[gnu::always_inline]] nce uint64x2_t equal_to_zero(int64x2_t a) { return vceqzq_s64(a); }
1210[[gnu::always_inline]] nce uint64_t equal_to_zero(int64_t a) { return vceqzd_s64(a); }
1211[[gnu::always_inline]] nce uint64_t equal_to_zero(uint64_t a) { return vceqzd_u64(a); }
1212[[gnu::always_inline]] nce uint32_t equal_to_zero(float32_t a) { return vceqzs_f32(a); }
1213[[gnu::always_inline]] nce uint64_t equal_to_zero(float64_t a) { return vceqzd_f64(a); }
1214[[gnu::always_inline]] nce uint64x1_t greater_than_or_equal(int64x1_t a, int64x1_t b) { return vcge_s64(a, b); }
1215[[gnu::always_inline]] nce uint64x2_t greater_than_or_equal(int64x2_t a, int64x2_t b) { return vcgeq_s64(a, b); }
1216[[gnu::always_inline]] nce uint64_t greater_than_or_equal(int64_t a, int64_t b) { return vcged_s64(a, b); }
1217[[gnu::always_inline]] nce uint64_t greater_than_or_equal(uint64_t a, uint64_t b) { return vcged_u64(a, b); }
1218[[gnu::always_inline]] nce uint32_t greater_than_or_equal(float32_t a, float32_t b) { return vcges_f32(a, b); }
1219[[gnu::always_inline]] nce uint64_t greater_than_or_equal(float64_t a, float64_t b) { return vcged_f64(a, b); }
1220[[gnu::always_inline]] nce uint64x1_t greater_than_or_equal_to_zero(int64x1_t a) { return vcgez_s64(a); }
1221[[gnu::always_inline]] nce uint64x2_t greater_than_or_equal_to_zero(int64x2_t a) { return vcgezq_s64(a); }
1222[[gnu::always_inline]] nce uint64_t greater_than_or_equal_to_zero(int64_t a) { return vcgezd_s64(a); }
1223[[gnu::always_inline]] nce uint32_t greater_than_or_equal_to_zero(float32_t a) { return vcgezs_f32(a); }
1224[[gnu::always_inline]] nce uint64_t greater_than_or_equal_to_zero(float64_t a) { return vcgezd_f64(a); }
1225[[gnu::always_inline]] nce uint64x1_t less_than_or_equal(int64x1_t a, int64x1_t b) { return vcle_s64(a, b); }
1226[[gnu::always_inline]] nce uint64x2_t less_than_or_equal(int64x2_t a, int64x2_t b) { return vcleq_s64(a, b); }
1227[[gnu::always_inline]] nce uint64_t less_than_or_equal(int64_t a, int64_t b) { return vcled_s64(a, b); }
1228[[gnu::always_inline]] nce uint64_t less_than_or_equal(uint64_t a, uint64_t b) { return vcled_u64(a, b); }
1229[[gnu::always_inline]] nce uint32_t less_than_or_equal(float32_t a, float32_t b) { return vcles_f32(a, b); }
1230[[gnu::always_inline]] nce uint64_t less_than_or_equal(float64_t a, float64_t b) { return vcled_f64(a, b); }
1231[[gnu::always_inline]] nce uint64x1_t less_than_or_equal_to_zero(int64x1_t a) { return vclez_s64(a); }
1232[[gnu::always_inline]] nce uint64x2_t less_than_or_equal_to_zero(int64x2_t a) { return vclezq_s64(a); }
1233[[gnu::always_inline]] nce uint64_t less_than_or_equal_to_zero(int64_t a) { return vclezd_s64(a); }
1234[[gnu::always_inline]] nce uint32_t less_than_or_equal_to_zero(float32_t a) { return vclezs_f32(a); }
1235[[gnu::always_inline]] nce uint64_t less_than_or_equal_to_zero(float64_t a) { return vclezd_f64(a); }
1236[[gnu::always_inline]] nce uint64x1_t greater_than(int64x1_t a, int64x1_t b) { return vcgt_s64(a, b); }
1237[[gnu::always_inline]] nce uint64x2_t greater_than(int64x2_t a, int64x2_t b) { return vcgtq_s64(a, b); }
1238[[gnu::always_inline]] nce uint64_t greater_than(int64_t a, int64_t b) { return vcgtd_s64(a, b); }
1239[[gnu::always_inline]] nce uint64_t greater_than(uint64_t a, uint64_t b) { return vcgtd_u64(a, b); }
1240[[gnu::always_inline]] nce uint32_t greater_than(float32_t a, float32_t b) { return vcgts_f32(a, b); }
1241[[gnu::always_inline]] nce uint64_t greater_than(float64_t a, float64_t b) { return vcgtd_f64(a, b); }
1242[[gnu::always_inline]] nce uint64x1_t greater_than_zero(int64x1_t a) { return vcgtz_s64(a); }
1243[[gnu::always_inline]] nce uint64x2_t greater_than_zero(int64x2_t a) { return vcgtzq_s64(a); }
1244[[gnu::always_inline]] nce uint64_t greater_than_zero(int64_t a) { return vcgtzd_s64(a); }
1245[[gnu::always_inline]] nce uint32_t greater_than_zero(float32_t a) { return vcgtzs_f32(a); }
1246[[gnu::always_inline]] nce uint64_t greater_than_zero(float64_t a) { return vcgtzd_f64(a); }
1247[[gnu::always_inline]] nce uint64x1_t less_than(int64x1_t a, int64x1_t b) { return vclt_s64(a, b); }
1248[[gnu::always_inline]] nce uint64x2_t less_than(int64x2_t a, int64x2_t b) { return vcltq_s64(a, b); }
1249[[gnu::always_inline]] nce uint64_t less_than(int64_t a, int64_t b) { return vcltd_s64(a, b); }
1250[[gnu::always_inline]] nce uint64_t less_than(uint64_t a, uint64_t b) { return vcltd_u64(a, b); }
1251[[gnu::always_inline]] nce uint32_t less_than(float32_t a, float32_t b) { return vclts_f32(a, b); }
1252[[gnu::always_inline]] nce uint64_t less_than(float64_t a, float64_t b) { return vcltd_f64(a, b); }
1253[[gnu::always_inline]] nce uint64x1_t less_than_zero(int64x1_t a) { return vcltz_s64(a); }
1254[[gnu::always_inline]] nce uint64x2_t less_than_zero(int64x2_t a) { return vcltzq_s64(a); }
1255[[gnu::always_inline]] nce uint64_t less_than_zero(int64_t a) { return vcltzd_s64(a); }
1256[[gnu::always_inline]] nce uint32_t less_than_zero(float32_t a) { return vcltzs_f32(a); }
1257[[gnu::always_inline]] nce uint64_t less_than_zero(float64_t a) { return vcltzd_f64(a); }
1258[[gnu::always_inline]] nce uint32_t absolute_greater_than_or_equal(float32_t a, float32_t b) { return vcages_f32(a, b); }
1259[[gnu::always_inline]] nce uint64_t absolute_greater_than_or_equal(float64_t a, float64_t b) { return vcaged_f64(a, b); }
1260[[gnu::always_inline]] nce uint32_t absolute_less_than_or_equal(float32_t a, float32_t b) { return vcales_f32(a, b); }
1261[[gnu::always_inline]] nce uint64_t absolute_less_than_or_equal(float64_t a, float64_t b) { return vcaled_f64(a, b); }
1262[[gnu::always_inline]] nce uint32_t absolute_greater_than(float32_t a, float32_t b) { return vcagts_f32(a, b); }
1263[[gnu::always_inline]] nce uint64_t absolute_greater_than(float64_t a, float64_t b) { return vcagtd_f64(a, b); }
1264[[gnu::always_inline]] nce uint32_t absolute_less_than(float32_t a, float32_t b) { return vcalts_f32(a, b); }
1265[[gnu::always_inline]] nce uint64_t absolute_less_than(float64_t a, float64_t b) { return vcaltd_f64(a, b); }
1266[[gnu::always_inline]] nce uint64x1_t compare_test_nonzero(int64x1_t a, int64x1_t b) { return vtst_s64(a, b); }
1267[[gnu::always_inline]] nce uint64x2_t compare_test_nonzero(int64x2_t a, int64x2_t b) { return vtstq_s64(a, b); }
1268[[gnu::always_inline]] nce uint64_t compare_test_nonzero(int64_t a, int64_t b) { return vtstd_s64(a, b); }
1269[[gnu::always_inline]] nce uint64_t compare_test_nonzero(uint64_t a, uint64_t b) { return vtstd_u64(a, b); }
1270[[gnu::always_inline]] nce int64_t shift_left(int64_t a, int64_t b) { return vshld_s64(a, b); }
1271[[gnu::always_inline]] nce uint64_t shift_left(uint64_t a, int64_t b) { return vshld_u64(a, b); }
1272template <int n>[[gnu::always_inline]] nce int64_t shift_left(int64_t a) { return vshld_n_s64(a, n); }
1273template <int n>[[gnu::always_inline]] nce uint64_t shift_left(uint64_t a) { return vshld_n_u64(a, n); }
1274[[gnu::always_inline]] nce int8_t shift_left_saturate(int8_t a, int8_t b) { return vqshlb_s8(a, b); }
1275[[gnu::always_inline]] nce int16_t shift_left_saturate(int16_t a, int16_t b) { return vqshlh_s16(a, b); }
1276[[gnu::always_inline]] nce int32_t shift_left_saturate(int32_t a, int32_t b) { return vqshls_s32(a, b); }
1277[[gnu::always_inline]] nce int64_t shift_left_saturate(int64_t a, int64_t b) { return vqshld_s64(a, b); }
1278[[gnu::always_inline]] nce uint8_t shift_left_saturate(uint8_t a, int8_t b) { return vqshlb_u8(a, b); }
1279[[gnu::always_inline]] nce uint16_t shift_left_saturate(uint16_t a, int16_t b) { return vqshlh_u16(a, b); }
1280[[gnu::always_inline]] nce uint32_t shift_left_saturate(uint32_t a, int32_t b) { return vqshls_u32(a, b); }
1281[[gnu::always_inline]] nce uint64_t shift_left_saturate(uint64_t a, int64_t b) { return vqshld_u64(a, b); }
1282template <int n>[[gnu::always_inline]] nce int8_t shift_left_saturate(int8_t a) { return vqshlb_n_s8(a, n); }
1283template <int n>[[gnu::always_inline]] nce int16_t shift_left_saturate(int16_t a) { return vqshlh_n_s16(a, n); }
1284template <int n>[[gnu::always_inline]] nce int32_t shift_left_saturate(int32_t a) { return vqshls_n_s32(a, n); }
1285template <int n>[[gnu::always_inline]] nce int64_t shift_left_saturate(int64_t a) { return vqshld_n_s64(a, n); }
1286template <int n>[[gnu::always_inline]] nce uint8_t shift_left_saturate(uint8_t a) { return vqshlb_n_u8(a, n); }
1287template <int n>[[gnu::always_inline]] nce uint16_t shift_left_saturate(uint16_t a) { return vqshlh_n_u16(a, n); }
1288template <int n>[[gnu::always_inline]] nce uint32_t shift_left_saturate(uint32_t a) { return vqshls_n_u32(a, n); }
1289template <int n>[[gnu::always_inline]] nce uint64_t shift_left_saturate(uint64_t a) { return vqshld_n_u64(a, n); }
1290template <int n>[[gnu::always_inline]] nce uint8_t shift_left_unsigned_saturate(int8_t a) { return vqshlub_n_s8(a, n); }
1291template <int n>[[gnu::always_inline]] nce uint16_t shift_left_unsigned_saturate(int16_t a) { return vqshluh_n_s16(a, n); }
1292template <int n>[[gnu::always_inline]] nce uint32_t shift_left_unsigned_saturate(int32_t a) { return vqshlus_n_s32(a, n); }
1293template <int n>[[gnu::always_inline]] nce uint64_t shift_left_unsigned_saturate(int64_t a) { return vqshlud_n_s64(a, n); }
1294[[gnu::always_inline]] nce int64_t shift_left_round(int64_t a, int64_t b) { return vrshld_s64(a, b); }
1295[[gnu::always_inline]] nce uint64_t shift_left_round(uint64_t a, int64_t b) { return vrshld_u64(a, b); }
1296[[gnu::always_inline]] nce int8_t shift_left_round_saturate(int8_t a, int8_t b) { return vqrshlb_s8(a, b); }
1297[[gnu::always_inline]] nce int16_t shift_left_round_saturate(int16_t a, int16_t b) { return vqrshlh_s16(a, b); }
1298[[gnu::always_inline]] nce int32_t shift_left_round_saturate(int32_t a, int32_t b) { return vqrshls_s32(a, b); }
1299[[gnu::always_inline]] nce int64_t shift_left_round_saturate(int64_t a, int64_t b) { return vqrshld_s64(a, b); }
1300[[gnu::always_inline]] nce uint8_t shift_left_round_saturate(uint8_t a, int8_t b) { return vqrshlb_u8(a, b); }
1301[[gnu::always_inline]] nce uint16_t shift_left_round_saturate(uint16_t a, int16_t b) { return vqrshlh_u16(a, b); }
1302[[gnu::always_inline]] nce uint32_t shift_left_round_saturate(uint32_t a, int32_t b) { return vqrshls_u32(a, b); }
1303[[gnu::always_inline]] nce uint64_t shift_left_round_saturate(uint64_t a, int64_t b) { return vqrshld_u64(a, b); }
1304template <int n>[[gnu::always_inline]] nce int64_t shift_left_insert(int64_t a, int64_t b) { return vslid_n_s64(a, b, n); }
1305template <int n>[[gnu::always_inline]] nce uint64_t shift_left_insert(uint64_t a, uint64_t b) { return vslid_n_u64(a, b, n); }
1306template <int n>[[gnu::always_inline]] nce int64_t shift_right(int64_t a) { return vshrd_n_s64(a, n); }
1307template <int n>[[gnu::always_inline]] nce uint64_t shift_right(uint64_t a) { return vshrd_n_u64(a, n); }
1308template <int n>[[gnu::always_inline]] nce int64_t shift_right_round(int64_t a) { return vrshrd_n_s64(a, n); }
1309template <int n>[[gnu::always_inline]] nce uint64_t shift_right_round(uint64_t a) { return vrshrd_n_u64(a, n); }
1310template <int n>[[gnu::always_inline]] nce int64_t shift_right_add(int64_t a, int64_t b) { return vsrad_n_s64(a, b, n); }
1311template <int n>[[gnu::always_inline]] nce uint64_t shift_right_add(uint64_t a, uint64_t b) { return vsrad_n_u64(a, b, n); }
1312template <int n>[[gnu::always_inline]] nce int64_t shift_right_accumulate_round(int64_t a, int64_t b) { return vrsrad_n_s64(a, b, n); }
1313template <int n>[[gnu::always_inline]] nce uint64_t shift_right_accumulate_round(uint64_t a, uint64_t b) { return vrsrad_n_u64(a, b, n); }
1314template <int n>[[gnu::always_inline]] nce uint8_t shift_right_saturate_narrow_unsigned(int16_t a) { return vqshrunh_n_s16(a, n); }
1315template <int n>[[gnu::always_inline]] nce uint16_t shift_right_saturate_narrow_unsigned(int32_t a) { return vqshruns_n_s32(a, n); }
1316template <int n>[[gnu::always_inline]] nce uint32_t shift_right_saturate_narrow_unsigned(int64_t a) { return vqshrund_n_s64(a, n); }
1317template <int n>[[gnu::always_inline]] nce int8_t shift_right_saturate_narrow(int16_t a) { return vqshrnh_n_s16(a, n); }
1318template <int n>[[gnu::always_inline]] nce int16_t shift_right_saturate_narrow(int32_t a) { return vqshrns_n_s32(a, n); }
1319template <int n>[[gnu::always_inline]] nce int32_t shift_right_saturate_narrow(int64_t a) { return vqshrnd_n_s64(a, n); }
1320template <int n>[[gnu::always_inline]] nce uint8_t shift_right_saturate_narrow(uint16_t a) { return vqshrnh_n_u16(a, n); }
1321template <int n>[[gnu::always_inline]] nce uint16_t shift_right_saturate_narrow(uint32_t a) { return vqshrns_n_u32(a, n); }
1322template <int n>[[gnu::always_inline]] nce uint32_t shift_right_saturate_narrow(uint64_t a) { return vqshrnd_n_u64(a, n); }
1323template <int n>[[gnu::always_inline]] nce uint8_t shift_right_unsigned_round_saturate_narrow(int16_t a) { return vqrshrunh_n_s16(a, n); }
1324template <int n>[[gnu::always_inline]] nce uint16_t shift_right_unsigned_round_saturate_narrow(int32_t a) { return vqrshruns_n_s32(a, n); }
1325template <int n>[[gnu::always_inline]] nce uint32_t shift_right_unsigned_round_saturate_narrow(int64_t a) { return vqrshrund_n_s64(a, n); }
1326template <int n>[[gnu::always_inline]] nce int8_t shift_right_round_saturate_narrow(int16_t a) { return vqrshrnh_n_s16(a, n); }
1327template <int n>[[gnu::always_inline]] nce int16_t shift_right_round_saturate_narrow(int32_t a) { return vqrshrns_n_s32(a, n); }
1328template <int n>[[gnu::always_inline]] nce int32_t shift_right_round_saturate_narrow(int64_t a) { return vqrshrnd_n_s64(a, n); }
1329template <int n>[[gnu::always_inline]] nce uint8_t shift_right_round_saturate_narrow(uint16_t a) { return vqrshrnh_n_u16(a, n); }
1330template <int n>[[gnu::always_inline]] nce uint16_t shift_right_round_saturate_narrow(uint32_t a) { return vqrshrns_n_u32(a, n); }
1331template <int n>[[gnu::always_inline]] nce uint32_t shift_right_round_saturate_narrow(uint64_t a) { return vqrshrnd_n_u64(a, n); }
1332template <int n>[[gnu::always_inline]] nce int64_t shift_right_insert(int64_t a, int64_t b) { return vsrid_n_s64(a, b, n); }
1333template <int n>[[gnu::always_inline]] nce uint64_t shift_right_insert(uint64_t a, uint64_t b) { return vsrid_n_u64(a, b, n); }
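// Editor's usage sketch (acc is a placeholder accumulator): the scalar narrowing shifts
// rescale and clamp in one step, e.g. reducing a 32-bit value to int16_t:
//   int32_t acc = 1 << 28;
//   int16_t out = neon::shift_right_round_saturate_narrow<8>(acc);  // vqrshrns_n_s32: 1<<20 exceeds int16_t, saturates to 32767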
1334template <> [[gnu::always_inline]] nce int32_t convert(float32_t a) { return vcvts_s32_f32(a); }
1335template <> [[gnu::always_inline]] nce uint32_t convert(float32_t a) { return vcvts_u32_f32(a); }
1336template <> [[gnu::always_inline]] nce int32_t convert_round_to_nearest_with_ties_to_even(float32_t a) { return vcvtns_s32_f32(a); }
1337template <> [[gnu::always_inline]] nce uint32_t convert_round_to_nearest_with_ties_to_even(float32_t a) { return vcvtns_u32_f32(a); }
1338template <> [[gnu::always_inline]] nce int32_t convert_round_toward_negative_infinity(float32_t a) { return vcvtms_s32_f32(a); }
1339template <> [[gnu::always_inline]] nce uint32_t convert_round_toward_negative_infinity(float32_t a) { return vcvtms_u32_f32(a); }
1340template <> [[gnu::always_inline]] nce int32_t convert_round_toward_positive_infinity(float32_t a) { return vcvtps_s32_f32(a); }
1341template <> [[gnu::always_inline]] nce uint32_t convert_round_toward_positive_infinity(float32_t a) { return vcvtps_u32_f32(a); }
1342template <> [[gnu::always_inline]] nce int32_t convert_round_to_nearest_with_ties_away_from_zero(float32_t a) { return vcvtas_s32_f32(a); }
1343template <> [[gnu::always_inline]] nce uint32_t convert_round_to_nearest_with_ties_away_from_zero(float32_t a) { return vcvtas_u32_f32(a); }
1344template <> [[gnu::always_inline]] nce int64_t convert(float64_t a) { return vcvtd_s64_f64(a); }
1345template <> [[gnu::always_inline]] nce uint64_t convert(float64_t a) { return vcvtd_u64_f64(a); }
1346template <> [[gnu::always_inline]] nce int64_t convert_round_to_nearest_with_ties_to_even(float64_t a) { return vcvtnd_s64_f64(a); }
1347template <> [[gnu::always_inline]] nce uint64_t convert_round_to_nearest_with_ties_to_even(float64_t a) { return vcvtnd_u64_f64(a); }
1348template <> [[gnu::always_inline]] nce int64_t convert_round_toward_negative_infinity(float64_t a) { return vcvtmd_s64_f64(a); }
1349template <> [[gnu::always_inline]] nce uint64_t convert_round_toward_negative_infinity(float64_t a) { return vcvtmd_u64_f64(a); }
1350template <> [[gnu::always_inline]] nce int64_t convert_round_toward_positive_infinity(float64_t a) { return vcvtpd_s64_f64(a); }
1351template <> [[gnu::always_inline]] nce uint64_t convert_round_toward_positive_infinity(float64_t a) { return vcvtpd_u64_f64(a); }
1352template <> [[gnu::always_inline]] nce int64_t convert_round_to_nearest_with_ties_away_from_zero(float64_t a) { return vcvtad_s64_f64(a); }
1353template <> [[gnu::always_inline]] nce uint64_t convert_round_to_nearest_with_ties_away_from_zero(float64_t a) { return vcvtad_u64_f64(a); }
1354template <int n>[[gnu::always_inline]] nce int32_t convert(float32_t a) { return vcvts_n_s32_f32(a, n); }
1355template <int n>[[gnu::always_inline]] nce uint32_t convert(float32_t a) { return vcvts_n_u32_f32(a, n); }
1356template <int n>[[gnu::always_inline]] nce int64_t convert(float64_t a) { return vcvtd_n_s64_f64(a, n); }
1357template <int n>[[gnu::always_inline]] nce uint64_t convert(float64_t a) { return vcvtd_n_u64_f64(a, n); }
1358template <> [[gnu::always_inline]] nce float32_t convert(int32_t a) { return vcvts_f32_s32(a); }
1359template <> [[gnu::always_inline]] nce float32_t convert(uint32_t a) { return vcvts_f32_u32(a); }
1360template <> [[gnu::always_inline]] nce float64x1_t convert(int64x1_t a) { return vcvt_f64_s64(a); }
1361template <> [[gnu::always_inline]] nce float64x2_t convert(int64x2_t a) { return vcvtq_f64_s64(a); }
1362template <> [[gnu::always_inline]] nce float64_t convert(int64_t a) { return vcvtd_f64_s64(a); }
1363template <> [[gnu::always_inline]] nce float64_t convert(uint64_t a) { return vcvtd_f64_u64(a); }
1364template <int n>[[gnu::always_inline]] nce float32_t convert(int32_t a) { return vcvts_n_f32_s32(a, n); }
1365template <int n>[[gnu::always_inline]] nce float32_t convert(uint32_t a) { return vcvts_n_f32_u32(a, n); }
1366template <int n>[[gnu::always_inline]] nce float64x1_t convert(int64x1_t a) { return vcvt_n_f64_s64(a, n); }
1367template <int n>[[gnu::always_inline]] nce float64x2_t convert(int64x2_t a) { return vcvtq_n_f64_s64(a, n); }
1368template <int n>[[gnu::always_inline]] nce float64_t convert(int64_t a) { return vcvtd_n_f64_s64(a, n); }
1369template <int n>[[gnu::always_inline]] nce float64_t convert(uint64_t a) { return vcvtd_n_f64_u64(a, n); }
1370[[gnu::always_inline]] nce float32_t convert_round_to_odd(float64_t a) { return vcvtxd_f32_f64(a); }
1371template <> [[gnu::always_inline]] nce float64x1_t reinterpret(int64x1_t a) { return vreinterpret_f64_s64(a); }
1372template <> [[gnu::always_inline]] nce float64x2_t reinterpret(poly8x16_t a) { return vreinterpretq_f64_p8(a); }
1373template <> [[gnu::always_inline]] nce float64x2_t reinterpret(poly16x8_t a) { return vreinterpretq_f64_p16(a); }
1374template <> [[gnu::always_inline]] nce float64x2_t reinterpret(int64x2_t a) { return vreinterpretq_f64_s64(a); }
1375template <> [[gnu::always_inline]] nce float64x1_t reinterpret(poly64x1_t a) { return vreinterpret_f64_p64(a); }
1376template <> [[gnu::always_inline]] nce float64x2_t reinterpret(poly64x2_t a) { return vreinterpretq_f64_p64(a); }
1377template <> [[gnu::always_inline]] nce float64x2_t reinterpret(poly128_t a) { return vreinterpretq_f64_p128(a); }
1378[[gnu::always_inline]] nce int8_t move_saturate_narrow(int16_t a) { return vqmovnh_s16(a); }
1379[[gnu::always_inline]] nce int16_t move_saturate_narrow(int32_t a) { return vqmovns_s32(a); }
1380[[gnu::always_inline]] nce int32_t move_saturate_narrow(int64_t a) { return vqmovnd_s64(a); }
1381[[gnu::always_inline]] nce uint8_t move_saturate_narrow(uint16_t a) { return vqmovnh_u16(a); }
1382[[gnu::always_inline]] nce uint16_t move_saturate_narrow(uint32_t a) { return vqmovns_u32(a); }
1383[[gnu::always_inline]] nce uint32_t move_saturate_narrow(uint64_t a) { return vqmovnd_u64(a); }
1384[[gnu::always_inline]] nce uint8_t move_unsigned_saturate_narrow(int16_t a) { return vqmovunh_s16(a); }
1385[[gnu::always_inline]] nce uint16_t move_unsigned_saturate_narrow(int32_t a) { return vqmovuns_s32(a); }
1386[[gnu::always_inline]] nce uint32_t move_unsigned_saturate_narrow(int64_t a) { return vqmovund_s64(a); }
1387template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_add_long_lane_high(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlal_high_lane_s32(a, b, v, lane); }
1388template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_add_long_lane(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlal_laneq_s32(a, b, v, lane); }
1389template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_add_long_lane_high(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlal_high_laneq_s32(a, b, v, lane); }
1390template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_subtract_long_lane_high(int64x2_t a, int32x4_t b, int32x2_t v) { return vmlsl_high_lane_s32(a, b, v, lane); }
1391template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_subtract_long_lane(int64x2_t a, int32x2_t b, int32x4_t v) { return vmlsl_laneq_s32(a, b, v, lane); }
1392template <int lane>[[gnu::always_inline]] nce int64x2_t multiply_subtract_long_lane_high(int64x2_t a, int32x4_t b, int32x4_t v) { return vmlsl_high_laneq_s32(a, b, v, lane); }
1393template <int lane>[[gnu::always_inline]] nce float32_t multiply_lane(float32_t a, float32x2_t v) { return vmuls_lane_f32(a, v, lane); }
1394template <int lane>[[gnu::always_inline]] nce float64_t multiply_lane(float64_t a, float64x1_t v) { return vmuld_lane_f64(a, v, lane); }
1395template <int lane>[[gnu::always_inline]] nce float32_t multiply_lane(float32_t a, float32x4_t v) { return vmuls_laneq_f32(a, v, lane); }
1396template <int lane>[[gnu::always_inline]] nce float64_t multiply_lane(float64_t a, float64x2_t v) { return vmuld_laneq_f64(a, v, lane); }
1397[[gnu::always_inline]] nce int64x2_t multiply_add_long_high(int64x2_t a, int32x4_t b, int32_t c) { return vmlal_high_n_s32(a, b, c); }
1398[[gnu::always_inline]] nce int64x2_t multiply_subtract_long_high(int64x2_t a, int32x4_t b, int32_t c) { return vmlsl_high_n_s32(a, b, c); }
1399[[gnu::always_inline]] nce int64x1_t negate(int64x1_t a) { return vneg_s64(a); }
1400[[gnu::always_inline]] nce int64_t negate(int64_t a) { return vnegd_s64(a); }
1401[[gnu::always_inline]] nce int64x2_t negate(int64x2_t a) { return vnegq_s64(a); }
1402[[gnu::always_inline]] nce int64x1_t negate_saturate(int64x1_t a) { return vqneg_s64(a); }
1403[[gnu::always_inline]] nce int64x2_t negate_saturate(int64x2_t a) { return vqnegq_s64(a); }
1404[[gnu::always_inline]] nce int8_t negate_saturate(int8_t a) { return vqnegb_s8(a); }
1405[[gnu::always_inline]] nce int16_t negate_saturate(int16_t a) { return vqnegh_s16(a); }
1406[[gnu::always_inline]] nce int32_t negate_saturate(int32_t a) { return vqnegs_s32(a); }
1407[[gnu::always_inline]] nce int64_t negate_saturate(int64_t a) { return vqnegd_s64(a); }
1408template <int lane1, int lane2>[[gnu::always_inline]] nce int64x1_t copy_lane(int64x1_t a, int64x1_t b) { return vcopy_lane_s64(a, lane1, b, lane2); }
1409template <int lane1, int lane2>[[gnu::always_inline]] nce int64x2_t copy_lane(int64x2_t a, int64x1_t b) { return vcopyq_lane_s64(a, lane1, b, lane2); }
1410template <int lane1, int lane2>[[gnu::always_inline]] nce poly8x16_t copy_lane(poly8x16_t a, poly8x8_t b) { return vcopyq_lane_p8(a, lane1, b, lane2); }
1411template <int lane1, int lane2>[[gnu::always_inline]] nce poly16x8_t copy_lane(poly16x8_t a, poly16x4_t b) { return vcopyq_lane_p16(a, lane1, b, lane2); }
1412template <int lane1, int lane2>[[gnu::always_inline]] nce int64x1_t copy_lane(int64x1_t a, int64x2_t b) { return vcopy_laneq_s64(a, lane1, b, lane2); }
1413template <int lane1, int lane2>[[gnu::always_inline]] nce int64x2_t copy_lane(int64x2_t a, int64x2_t b) { return vcopyq_laneq_s64(a, lane1, b, lane2); }
1414template <int lane1, int lane2>[[gnu::always_inline]] nce poly8x16_t copy_lane(poly8x16_t a, poly8x16_t b) { return vcopyq_laneq_p8(a, lane1, b, lane2); }
1415template <int lane1, int lane2>[[gnu::always_inline]] nce poly16x8_t copy_lane(poly16x8_t a, poly16x8_t b) { return vcopyq_laneq_p16(a, lane1, b, lane2); }
1416[[gnu::always_inline]] nce poly8x16_t reverse_bits(poly8x16_t a) { return vrbitq_p8(a); }
1417template <> [[gnu::always_inline]] nce float64x1_t create(uint64_t a) { return vcreate_f64(a); }
1418template <> [[gnu::always_inline]] nce float64x1_t duplicate(float64_t value) { return vdup_n_f64(value); }
1419template <> [[gnu::always_inline]] nce float64x2_t duplicate(float64_t value) { return vdupq_n_f64(value); }
1420template <> [[gnu::always_inline]] nce float64x1_t move(float64_t value) { return vmov_n_f64(value); }
1421template <> [[gnu::always_inline]] nce float64x2_t move(float64_t value) { return vmovq_n_f64(value); }
1422template <int lane>[[gnu::always_inline]] nce int64x1_t duplicate_lane(int64x2_t vec) { return vdup_laneq_s64(vec, lane); }
1423template <int lane>[[gnu::always_inline]] nce int64x2_t duplicate_lane_quad(int64x2_t vec) { return vdupq_laneq_s64(vec, lane); }
1424template <int lane>[[gnu::always_inline]] nce poly64x1_t duplicate_lane(poly64x2_t vec) { return vdup_laneq_p64(vec, lane); }
1425template <int lane>[[gnu::always_inline]] nce poly64x2_t duplicate_lane_quad(poly64x2_t vec) { return vdupq_laneq_p64(vec, lane); }
1426template <int lane>[[gnu::always_inline]] nce poly8x8_t duplicate_lane(poly8x16_t vec) { return vdup_laneq_p8(vec, lane); }
1427template <int lane>[[gnu::always_inline]] nce poly8x16_t duplicate_lane_quad(poly8x16_t vec) { return vdupq_laneq_p8(vec, lane); }
1428template <int lane>[[gnu::always_inline]] nce poly16x4_t duplicate_lane(poly16x8_t vec) { return vdup_laneq_p16(vec, lane); }
1429template <int lane>[[gnu::always_inline]] nce poly16x8_t duplicate_lane_quad(poly16x8_t vec) { return vdupq_laneq_p16(vec, lane); }
1430template <int lane>[[gnu::always_inline]] nce int64_t duplicate_lane(int64x1_t vec) { return vdupd_lane_s64(vec, lane); }
1431template <int lane>[[gnu::always_inline]] nce int64_t duplicate_lane(int64x2_t vec) { return vdupd_laneq_s64(vec, lane); }
1432[[gnu::always_inline]] nce int64x2_t zip1(int64x2_t a, int64x2_t b) { return vzip1q_s64(a, b); }
1433[[gnu::always_inline]] nce poly64x2_t zip1(poly64x2_t a, poly64x2_t b) { return vzip1q_p64(a, b); }
1434[[gnu::always_inline]] nce poly8x16_t zip1(poly8x16_t a, poly8x16_t b) { return vzip1q_p8(a, b); }
1435[[gnu::always_inline]] nce poly16x8_t zip1(poly16x8_t a, poly16x8_t b) { return vzip1q_p16(a, b); }
1436[[gnu::always_inline]] nce int64x2_t zip2(int64x2_t a, int64x2_t b) { return vzip2q_s64(a, b); }
1437[[gnu::always_inline]] nce poly64x2_t zip2(poly64x2_t a, poly64x2_t b) { return vzip2q_p64(a, b); }
1438[[gnu::always_inline]] nce poly8x16_t zip2(poly8x16_t a, poly8x16_t b) { return vzip2q_p8(a, b); }
1439[[gnu::always_inline]] nce poly16x8_t zip2(poly16x8_t a, poly16x8_t b) { return vzip2q_p16(a, b); }
1440[[gnu::always_inline]] nce int64x2_t unzip1(int64x2_t a, int64x2_t b) { return vuzp1q_s64(a, b); }
1441[[gnu::always_inline]] nce poly64x2_t unzip1(poly64x2_t a, poly64x2_t b) { return vuzp1q_p64(a, b); }
1442[[gnu::always_inline]] nce poly8x16_t unzip1(poly8x16_t a, poly8x16_t b) { return vuzp1q_p8(a, b); }
1443[[gnu::always_inline]] nce poly16x8_t unzip1(poly16x8_t a, poly16x8_t b) { return vuzp1q_p16(a, b); }
1444[[gnu::always_inline]] nce int64x2_t unzip2(int64x2_t a, int64x2_t b) { return vuzp2q_s64(a, b); }
1445[[gnu::always_inline]] nce poly64x2_t unzip2(poly64x2_t a, poly64x2_t b) { return vuzp2q_p64(a, b); }
1446[[gnu::always_inline]] nce poly8x16_t unzip2(poly8x16_t a, poly8x16_t b) { return vuzp2q_p8(a, b); }
1447[[gnu::always_inline]] nce poly16x8_t unzip2(poly16x8_t a, poly16x8_t b) { return vuzp2q_p16(a, b); }
1448[[gnu::always_inline]] nce int64x2_t transpose_step_1(int64x2_t a, int64x2_t b) { return vtrn1q_s64(a, b); }
1449[[gnu::always_inline]] nce poly64x2_t transpose_step_1(poly64x2_t a, poly64x2_t b) { return vtrn1q_p64(a, b); }
1450[[gnu::always_inline]] nce poly8x16_t transpose_step_1(poly8x16_t a, poly8x16_t b) { return vtrn1q_p8(a, b); }
1451[[gnu::always_inline]] nce poly16x8_t transpose_step_1(poly16x8_t a, poly16x8_t b) { return vtrn1q_p16(a, b); }
1452[[gnu::always_inline]] nce int64x2_t transpose_step_2(int64x2_t a, int64x2_t b) { return vtrn2q_s64(a, b); }
1453[[gnu::always_inline]] nce poly64x2_t transpose_step_2(poly64x2_t a, poly64x2_t b) { return vtrn2q_p64(a, b); }
1454[[gnu::always_inline]] nce poly8x16_t transpose_step_2(poly8x16_t a, poly8x16_t b) { return vtrn2q_p8(a, b); }
1455[[gnu::always_inline]] nce poly16x8_t transpose_step_2(poly16x8_t a, poly16x8_t b) { return vtrn2q_p16(a, b); }
1456template <int lane>[[gnu::always_inline]] nce float64x1_t set_lane(float64_t a, float64x1_t v) { return vset_lane_f64(a, v, lane); }
1457template <int lane>[[gnu::always_inline]] nce float64x2_t set_lane(float64_t a, float64x2_t v) { return vsetq_lane_f64(a, v, lane); }
1458template <> [[gnu::always_inline]] inline float64x1_t load1(float64_t const *ptr) { return vld1_f64(ptr); }
1459template <> [[gnu::always_inline]] inline float64x2_t load1(float64_t const *ptr) { return vld1q_f64(ptr); }
1460template <int lane>[[gnu::always_inline]] nce float64x1_t load1_lane(float64_t const *ptr, float64x1_t src) { return vld1_lane_f64(ptr, src, lane); }
1461template <int lane>[[gnu::always_inline]] nce float64x2_t load1_lane(float64_t const *ptr, float64x2_t src) { return vld1q_lane_f64(ptr, src, lane); }
1462template <int lane>[[gnu::always_inline]] nce uint64x1_t load_acquire1_lane(uint64_t const *ptr, uint64x1_t src) { return vldap1_lane_u64(ptr, src, lane); }
1463template <int lane>[[gnu::always_inline]] nce uint64x2_t load_acquire1_lane(uint64_t const *ptr, uint64x2_t src) { return vldap1q_lane_u64(ptr, src, lane); }
1464template <int lane>[[gnu::always_inline]] nce int64x1_t load_acquire1_lane(int64_t const *ptr, int64x1_t src) { return vldap1_lane_s64(ptr, src, lane); }
1465template <int lane>[[gnu::always_inline]] nce int64x2_t load_acquire1_lane(int64_t const *ptr, int64x2_t src) { return vldap1q_lane_s64(ptr, src, lane); }
1466template <int lane>[[gnu::always_inline]] nce float64x1_t load_acquire1_lane(float64_t const *ptr, float64x1_t src) { return vldap1_lane_f64(ptr, src, lane); }
1467template <int lane>[[gnu::always_inline]] nce float64x2_t load_acquire1_lane(float64_t const *ptr, float64x2_t src) { return vldap1q_lane_f64(ptr, src, lane); }
1468template <int lane>[[gnu::always_inline]] nce poly64x1_t load_acquire1_lane(poly64_t const *ptr, poly64x1_t src) { return vldap1_lane_p64(ptr, src, lane); }
1469template <int lane>[[gnu::always_inline]] nce poly64x2_t load_acquire1_lane(poly64_t const *ptr, poly64x2_t src) { return vldap1q_lane_p64(ptr, src, lane); }
1470template <> [[gnu::always_inline]] inline float64x1_t load1_duplicate(float64_t const *ptr) { return vld1_dup_f64(ptr); }
1471template <> [[gnu::always_inline]] inline float64x2_t load1_duplicate(float64_t const *ptr) { return vld1q_dup_f64(ptr); }
1472template <int lane>[[gnu::always_inline]] nce void store_release1_lane(uint64_t *ptr, uint64x1_t val) { return vstl1_lane_u64(ptr, val, lane); }
1473template <int lane>[[gnu::always_inline]] nce void store_release1_lane(uint64_t *ptr, uint64x2_t val) { return vstl1q_lane_u64(ptr, val, lane); }
1474template <int lane>[[gnu::always_inline]] nce void store_release1_lane(int64_t *ptr, int64x1_t val) { return vstl1_lane_s64(ptr, val, lane); }
1475template <int lane>[[gnu::always_inline]] nce void store_release1_lane(int64_t *ptr, int64x2_t val) { return vstl1q_lane_s64(ptr, val, lane); }
1476template <int lane>[[gnu::always_inline]] nce void store_release1_lane(float64_t *ptr, float64x1_t val) { return vstl1_lane_f64(ptr, val, lane); }
1477template <int lane>[[gnu::always_inline]] nce void store_release1_lane(float64_t *ptr, float64x2_t val) { return vstl1q_lane_f64(ptr, val, lane); }
1478template <int lane>[[gnu::always_inline]] nce void store_release1_lane(poly64_t *ptr, poly64x1_t val) { return vstl1_lane_p64(ptr, val, lane); }
1479template <int lane>[[gnu::always_inline]] nce void store_release1_lane(poly64_t *ptr, poly64x2_t val) { return vstl1q_lane_p64(ptr, val, lane); }
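// Note: the load2/load3/load4 specializations below wrap the de-interleaving vld2/vld3/vld4
// intrinsics: groups of 2, 3 or 4 interleaved elements are split across the members of the
// returned x2/x3/x4 tuple. The *_duplicate forms broadcast one interleaved group across all
// lanes, and the *_lane forms refill a single lane of an existing tuple. Illustrative sketch:
//   given float64_t xy[4] = {x0, y0, x1, y1}, vld2q_f64(xy) yields
//   val[0] = {x0, x1} and val[1] = {y0, y1}.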
1480template <> [[gnu::always_inline]] inline int64x2x2_t load2(int64_t const *ptr) { return vld2q_s64(ptr); }
1481template <> [[gnu::always_inline]] inline uint64x2x2_t load2(uint64_t const *ptr) { return vld2q_u64(ptr); }
1482template <> [[gnu::always_inline]] inline poly64x2x2_t load2(poly64_t const *ptr) { return vld2q_p64(ptr); }
1483template <> [[gnu::always_inline]] inline float64x1x2_t load2(float64_t const *ptr) { return vld2_f64(ptr); }
1484template <> [[gnu::always_inline]] inline float64x2x2_t load2(float64_t const *ptr) { return vld2q_f64(ptr); }
1485template <> [[gnu::always_inline]] inline int64x2x3_t load3(int64_t const *ptr) { return vld3q_s64(ptr); }
1486template <> [[gnu::always_inline]] inline uint64x2x3_t load3(uint64_t const *ptr) { return vld3q_u64(ptr); }
1487template <> [[gnu::always_inline]] inline poly64x2x3_t load3(poly64_t const *ptr) { return vld3q_p64(ptr); }
1488template <> [[gnu::always_inline]] inline float64x1x3_t load3(float64_t const *ptr) { return vld3_f64(ptr); }
1489template <> [[gnu::always_inline]] inline float64x2x3_t load3(float64_t const *ptr) { return vld3q_f64(ptr); }
1490template <> [[gnu::always_inline]] inline int64x2x4_t load4(int64_t const *ptr) { return vld4q_s64(ptr); }
1491template <> [[gnu::always_inline]] inline uint64x2x4_t load4(uint64_t const *ptr) { return vld4q_u64(ptr); }
1492template <> [[gnu::always_inline]] inline poly64x2x4_t load4(poly64_t const *ptr) { return vld4q_p64(ptr); }
1493template <> [[gnu::always_inline]] inline float64x1x4_t load4(float64_t const *ptr) { return vld4_f64(ptr); }
1494template <> [[gnu::always_inline]] inline float64x2x4_t load4(float64_t const *ptr) { return vld4q_f64(ptr); }
1495template <> [[gnu::always_inline]] inline int64x2x2_t load2_duplicate(int64_t const *ptr) { return vld2q_dup_s64(ptr); }
1496template <> [[gnu::always_inline]] inline uint64x2x2_t load2_duplicate(uint64_t const *ptr) { return vld2q_dup_u64(ptr); }
1497template <> [[gnu::always_inline]] inline poly64x2x2_t load2_duplicate(poly64_t const *ptr) { return vld2q_dup_p64(ptr); }
1498template <> [[gnu::always_inline]] inline float64x1x2_t load2_duplicate(float64_t const *ptr) { return vld2_dup_f64(ptr); }
1499template <> [[gnu::always_inline]] inline float64x2x2_t load2_duplicate(float64_t const *ptr) { return vld2q_dup_f64(ptr); }
1500template <> [[gnu::always_inline]] inline int64x2x3_t load3_duplicate(int64_t const *ptr) { return vld3q_dup_s64(ptr); }
1501template <> [[gnu::always_inline]] inline uint64x2x3_t load3_duplicate(uint64_t const *ptr) { return vld3q_dup_u64(ptr); }
1502template <> [[gnu::always_inline]] inline poly64x2x3_t load3_duplicate(poly64_t const *ptr) { return vld3q_dup_p64(ptr); }
1503template <> [[gnu::always_inline]] inline float64x1x3_t load3_duplicate(float64_t const *ptr) { return vld3_dup_f64(ptr); }
1504template <> [[gnu::always_inline]] inline float64x2x3_t load3_duplicate(float64_t const *ptr) { return vld3q_dup_f64(ptr); }
1505template <> [[gnu::always_inline]] inline int64x2x4_t load4_duplicate(int64_t const *ptr) { return vld4q_dup_s64(ptr); }
1506template <> [[gnu::always_inline]] inline uint64x2x4_t load4_duplicate(uint64_t const *ptr) { return vld4q_dup_u64(ptr); }
1507template <> [[gnu::always_inline]] inline poly64x2x4_t load4_duplicate(poly64_t const *ptr) { return vld4q_dup_p64(ptr); }
1508template <> [[gnu::always_inline]] inline float64x1x4_t load4_duplicate(float64_t const *ptr) { return vld4_dup_f64(ptr); }
1509template <> [[gnu::always_inline]] inline float64x2x4_t load4_duplicate(float64_t const *ptr) { return vld4q_dup_f64(ptr); }
1510template <int lane>[[gnu::always_inline]] nce int8x16x2_t load2_lane(int8_t const *ptr, int8x16x2_t src) { return vld2q_lane_s8(ptr, src, lane); }
1511template <int lane>[[gnu::always_inline]] nce uint8x16x2_t load2_lane(uint8_t const *ptr, uint8x16x2_t src) { return vld2q_lane_u8(ptr, src, lane); }
1512template <int lane>[[gnu::always_inline]] nce poly8x16x2_t load2_lane(poly8_t const *ptr, poly8x16x2_t src) { return vld2q_lane_p8(ptr, src, lane); }
1513template <int lane>[[gnu::always_inline]] nce int64x1x2_t load2_lane(int64_t const *ptr, int64x1x2_t src) { return vld2_lane_s64(ptr, src, lane); }
1514template <int lane>[[gnu::always_inline]] nce int64x2x2_t load2_lane(int64_t const *ptr, int64x2x2_t src) { return vld2q_lane_s64(ptr, src, lane); }
1515template <int lane>[[gnu::always_inline]] nce uint64x1x2_t load2_lane(uint64_t const *ptr, uint64x1x2_t src) { return vld2_lane_u64(ptr, src, lane); }
1516template <int lane>[[gnu::always_inline]] nce uint64x2x2_t load2_lane(uint64_t const *ptr, uint64x2x2_t src) { return vld2q_lane_u64(ptr, src, lane); }
1517template <int lane>[[gnu::always_inline]] nce poly64x1x2_t load2_lane(poly64_t const *ptr, poly64x1x2_t src) { return vld2_lane_p64(ptr, src, lane); }
1518template <int lane>[[gnu::always_inline]] nce poly64x2x2_t load2_lane(poly64_t const *ptr, poly64x2x2_t src) { return vld2q_lane_p64(ptr, src, lane); }
1519template <int lane>[[gnu::always_inline]] nce float64x1x2_t load2_lane(float64_t const *ptr, float64x1x2_t src) { return vld2_lane_f64(ptr, src, lane); }
1520template <int lane>[[gnu::always_inline]] nce float64x2x2_t load2_lane(float64_t const *ptr, float64x2x2_t src) { return vld2q_lane_f64(ptr, src, lane); }
1521template <int lane>[[gnu::always_inline]] nce int8x16x3_t load3_lane(int8_t const *ptr, int8x16x3_t src) { return vld3q_lane_s8(ptr, src, lane); }
1522template <int lane>[[gnu::always_inline]] nce uint8x16x3_t load3_lane(uint8_t const *ptr, uint8x16x3_t src) { return vld3q_lane_u8(ptr, src, lane); }
1523template <int lane>[[gnu::always_inline]] nce poly8x16x3_t load3_lane(poly8_t const *ptr, poly8x16x3_t src) { return vld3q_lane_p8(ptr, src, lane); }
1524template <int lane>[[gnu::always_inline]] nce int64x1x3_t load3_lane(int64_t const *ptr, int64x1x3_t src) { return vld3_lane_s64(ptr, src, lane); }
1525template <int lane>[[gnu::always_inline]] nce int64x2x3_t load3_lane(int64_t const *ptr, int64x2x3_t src) { return vld3q_lane_s64(ptr, src, lane); }
1526template <int lane>[[gnu::always_inline]] nce uint64x1x3_t load3_lane(uint64_t const *ptr, uint64x1x3_t src) { return vld3_lane_u64(ptr, src, lane); }
1527template <int lane>[[gnu::always_inline]] nce uint64x2x3_t load3_lane(uint64_t const *ptr, uint64x2x3_t src) { return vld3q_lane_u64(ptr, src, lane); }
1528template <int lane>[[gnu::always_inline]] nce poly64x1x3_t load3_lane(poly64_t const *ptr, poly64x1x3_t src) { return vld3_lane_p64(ptr, src, lane); }
1529template <int lane>[[gnu::always_inline]] nce poly64x2x3_t load3_lane(poly64_t const *ptr, poly64x2x3_t src) { return vld3q_lane_p64(ptr, src, lane); }
1530template <int lane>[[gnu::always_inline]] nce float64x1x3_t load3_lane(float64_t const *ptr, float64x1x3_t src) { return vld3_lane_f64(ptr, src, lane); }
1531template <int lane>[[gnu::always_inline]] nce float64x2x3_t load3_lane(float64_t const *ptr, float64x2x3_t src) { return vld3q_lane_f64(ptr, src, lane); }
1532template <int lane>[[gnu::always_inline]] nce int8x16x4_t load4_lane(int8_t const *ptr, int8x16x4_t src) { return vld4q_lane_s8(ptr, src, lane); }
1533template <int lane>[[gnu::always_inline]] nce uint8x16x4_t load4_lane(uint8_t const *ptr, uint8x16x4_t src) { return vld4q_lane_u8(ptr, src, lane); }
1534template <int lane>[[gnu::always_inline]] nce poly8x16x4_t load4_lane(poly8_t const *ptr, poly8x16x4_t src) { return vld4q_lane_p8(ptr, src, lane); }
1535template <int lane>[[gnu::always_inline]] nce int64x1x4_t load4_lane(int64_t const *ptr, int64x1x4_t src) { return vld4_lane_s64(ptr, src, lane); }
1536template <int lane>[[gnu::always_inline]] nce int64x2x4_t load4_lane(int64_t const *ptr, int64x2x4_t src) { return vld4q_lane_s64(ptr, src, lane); }
1537template <int lane>[[gnu::always_inline]] nce uint64x1x4_t load4_lane(uint64_t const *ptr, uint64x1x4_t src) { return vld4_lane_u64(ptr, src, lane); }
1538template <int lane>[[gnu::always_inline]] nce uint64x2x4_t load4_lane(uint64_t const *ptr, uint64x2x4_t src) { return vld4q_lane_u64(ptr, src, lane); }
1539template <int lane>[[gnu::always_inline]] nce poly64x1x4_t load4_lane(poly64_t const *ptr, poly64x1x4_t src) { return vld4_lane_p64(ptr, src, lane); }
1540template <int lane>[[gnu::always_inline]] nce poly64x2x4_t load4_lane(poly64_t const *ptr, poly64x2x4_t src) { return vld4q_lane_p64(ptr, src, lane); }
1541template <int lane>[[gnu::always_inline]] nce float64x1x4_t load4_lane(float64_t const *ptr, float64x1x4_t src) { return vld4_lane_f64(ptr, src, lane); }
1542template <int lane>[[gnu::always_inline]] nce float64x2x4_t load4_lane(float64_t const *ptr, float64x2x4_t src) { return vld4q_lane_f64(ptr, src, lane); }
1543template <> [[gnu::always_inline]] inline float64x1x2_t load1_x2(float64_t const *ptr) { return vld1_f64_x2(ptr); }
1544template <> [[gnu::always_inline]] inline float64x2x2_t load1_x2(float64_t const *ptr) { return vld1q_f64_x2(ptr); }
1545template <> [[gnu::always_inline]] inline float64x1x3_t load1_x3(float64_t const *ptr) { return vld1_f64_x3(ptr); }
1546template <> [[gnu::always_inline]] inline float64x2x3_t load1_x3(float64_t const *ptr) { return vld1q_f64_x3(ptr); }
1547template <> [[gnu::always_inline]] inline float64x1x4_t load1_x4(float64_t const *ptr) { return vld1_f64_x4(ptr); }
1548template <> [[gnu::always_inline]] inline float64x2x4_t load1_x4(float64_t const *ptr) { return vld1q_f64_x4(ptr); }
1549template <> [[gnu::always_inline]] inline void store1(float64_t *ptr, float64x1_t val) { return vst1_f64(ptr, val); }
1550template <> [[gnu::always_inline]] inline void store1(float64_t *ptr, float64x2_t val) { return vst1q_f64(ptr, val); }
1551template <int lane>[[gnu::always_inline]] nce void store1_lane(float64_t *ptr, float64x1_t val) { return vst1_lane_f64(ptr, val, lane); }
1552template <int lane>[[gnu::always_inline]] nce void store1_lane(float64_t *ptr, float64x2_t val) { return vst1q_lane_f64(ptr, val, lane); }
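// Note: the store2/store3/store4 specializations below wrap the interleaving vst2/vst3/vst4
// intrinsics: the members of the x2/x3/x4 tuple are written back element-wise interleaved,
// the inverse of load2/load3/load4 above. The *_lane forms further below store one selected
// lane from each tuple member.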
1553template <> [[gnu::always_inline]] inline void store2(int64_t *ptr, int64x2x2_t val) { return vst2q_s64(ptr, val); }
1554template <> [[gnu::always_inline]] inline void store2(uint64_t *ptr, uint64x2x2_t val) { return vst2q_u64(ptr, val); }
1555template <> [[gnu::always_inline]] inline void store2(poly64_t *ptr, poly64x2x2_t val) { return vst2q_p64(ptr, val); }
1556template <> [[gnu::always_inline]] inline void store2(float64_t *ptr, float64x1x2_t val) { return vst2_f64(ptr, val); }
1557template <> [[gnu::always_inline]] inline void store2(float64_t *ptr, float64x2x2_t val) { return vst2q_f64(ptr, val); }
1558template <> [[gnu::always_inline]] inline void store3(int64_t *ptr, int64x2x3_t val) { return vst3q_s64(ptr, val); }
1559template <> [[gnu::always_inline]] inline void store3(uint64_t *ptr, uint64x2x3_t val) { return vst3q_u64(ptr, val); }
1560template <> [[gnu::always_inline]] inline void store3(poly64_t *ptr, poly64x2x3_t val) { return vst3q_p64(ptr, val); }
1561template <> [[gnu::always_inline]] inline void store3(float64_t *ptr, float64x1x3_t val) { return vst3_f64(ptr, val); }
1562template <> [[gnu::always_inline]] inline void store3(float64_t *ptr, float64x2x3_t val) { return vst3q_f64(ptr, val); }
1563template <> [[gnu::always_inline]] inline void store4(int64_t *ptr, int64x2x4_t val) { return vst4q_s64(ptr, val); }
1564template <> [[gnu::always_inline]] inline void store4(uint64_t *ptr, uint64x2x4_t val) { return vst4q_u64(ptr, val); }
1565template <> [[gnu::always_inline]] inline void store4(poly64_t *ptr, poly64x2x4_t val) { return vst4q_p64(ptr, val); }
1566template <> [[gnu::always_inline]] inline void store4(float64_t *ptr, float64x1x4_t val) { return vst4_f64(ptr, val); }
1567template <> [[gnu::always_inline]] inline void store4(float64_t *ptr, float64x2x4_t val) { return vst4q_f64(ptr, val); }
1568template <int lane>[[gnu::always_inline]] nce void store2_lane(int8_t *ptr, int8x16x2_t val) { return vst2q_lane_s8(ptr, val, lane); }
1569template <int lane>[[gnu::always_inline]] nce void store2_lane(uint8_t *ptr, uint8x16x2_t val) { return vst2q_lane_u8(ptr, val, lane); }
1570template <int lane>[[gnu::always_inline]] nce void store2_lane(poly8_t *ptr, poly8x16x2_t val) { return vst2q_lane_p8(ptr, val, lane); }
1571template <int lane>[[gnu::always_inline]] nce void store2_lane(int64_t *ptr, int64x1x2_t val) { return vst2_lane_s64(ptr, val, lane); }
1572template <int lane>[[gnu::always_inline]] nce void store2_lane(int64_t *ptr, int64x2x2_t val) { return vst2q_lane_s64(ptr, val, lane); }
1573template <int lane>[[gnu::always_inline]] nce void store2_lane(uint64_t *ptr, uint64x1x2_t val) { return vst2_lane_u64(ptr, val, lane); }
1574template <int lane>[[gnu::always_inline]] nce void store2_lane(uint64_t *ptr, uint64x2x2_t val) { return vst2q_lane_u64(ptr, val, lane); }
1575template <int lane>[[gnu::always_inline]] nce void store2_lane(poly64_t *ptr, poly64x1x2_t val) { return vst2_lane_p64(ptr, val, lane); }
1576template <int lane>[[gnu::always_inline]] nce void store2_lane(poly64_t *ptr, poly64x2x2_t val) { return vst2q_lane_p64(ptr, val, lane); }
1577template <int lane>[[gnu::always_inline]] nce void store2_lane(float64_t *ptr, float64x1x2_t val) { return vst2_lane_f64(ptr, val, lane); }
1578template <int lane>[[gnu::always_inline]] nce void store2_lane(float64_t *ptr, float64x2x2_t val) { return vst2q_lane_f64(ptr, val, lane); }
1579template <int lane>[[gnu::always_inline]] nce void store3_lane(int64_t *ptr, int64x1x3_t val) { return vst3_lane_s64(ptr, val, lane); }
1580template <int lane>[[gnu::always_inline]] nce void store3_lane(int64_t *ptr, int64x2x3_t val) { return vst3q_lane_s64(ptr, val, lane); }
1581template <int lane>[[gnu::always_inline]] nce void store3_lane(uint64_t *ptr, uint64x1x3_t val) { return vst3_lane_u64(ptr, val, lane); }
1582template <int lane>[[gnu::always_inline]] nce void store3_lane(uint64_t *ptr, uint64x2x3_t val) { return vst3q_lane_u64(ptr, val, lane); }
1583template <int lane>[[gnu::always_inline]] nce void store3_lane(poly64_t *ptr, poly64x1x3_t val) { return vst3_lane_p64(ptr, val, lane); }
1584template <int lane>[[gnu::always_inline]] nce void store3_lane(poly64_t *ptr, poly64x2x3_t val) { return vst3q_lane_p64(ptr, val, lane); }
1585template <int lane>[[gnu::always_inline]] nce void store3_lane(float64_t *ptr, float64x1x3_t val) { return vst3_lane_f64(ptr, val, lane); }
1586template <int lane>[[gnu::always_inline]] nce void store3_lane(float64_t *ptr, float64x2x3_t val) { return vst3q_lane_f64(ptr, val, lane); }
1587template <int lane>[[gnu::always_inline]] nce void store4_lane(int8_t *ptr, int8x16x4_t val) { return vst4q_lane_s8(ptr, val, lane); }
1588template <int lane>[[gnu::always_inline]] nce void store4_lane(uint8_t *ptr, uint8x16x4_t val) { return vst4q_lane_u8(ptr, val, lane); }
1589template <int lane>[[gnu::always_inline]] nce void store4_lane(poly8_t *ptr, poly8x16x4_t val) { return vst4q_lane_p8(ptr, val, lane); }
1590template <int lane>[[gnu::always_inline]] nce void store4_lane(int64_t *ptr, int64x1x4_t val) { return vst4_lane_s64(ptr, val, lane); }
1591template <int lane>[[gnu::always_inline]] nce void store4_lane(int64_t *ptr, int64x2x4_t val) { return vst4q_lane_s64(ptr, val, lane); }
1592template <int lane>[[gnu::always_inline]] nce void store4_lane(uint64_t *ptr, uint64x1x4_t val) { return vst4_lane_u64(ptr, val, lane); }
1593template <int lane>[[gnu::always_inline]] nce void store4_lane(uint64_t *ptr, uint64x2x4_t val) { return vst4q_lane_u64(ptr, val, lane); }
1594template <int lane>[[gnu::always_inline]] nce void store4_lane(poly64_t *ptr, poly64x1x4_t val) { return vst4_lane_p64(ptr, val, lane); }
1595template <int lane>[[gnu::always_inline]] nce void store4_lane(poly64_t *ptr, poly64x2x4_t val) { return vst4q_lane_p64(ptr, val, lane); }
1596template <int lane>[[gnu::always_inline]] nce void store4_lane(float64_t *ptr, float64x1x4_t val) { return vst4_lane_f64(ptr, val, lane); }
1597template <int lane>[[gnu::always_inline]] nce void store4_lane(float64_t *ptr, float64x2x4_t val) { return vst4q_lane_f64(ptr, val, lane); }
1598[[gnu::always_inline]] inline void store1_x2(float64_t *ptr, float64x1x2_t val) { return vst1_f64_x2(ptr, val); }
1599[[gnu::always_inline]] inline void store1_x2(float64_t *ptr, float64x2x2_t val) { return vst1q_f64_x2(ptr, val); }
1600[[gnu::always_inline]] inline void store1_x3(float64_t *ptr, float64x1x3_t val) { return vst1_f64_x3(ptr, val); }
1601[[gnu::always_inline]] inline void store1_x3(float64_t *ptr, float64x2x3_t val) { return vst1q_f64_x3(ptr, val); }
1602[[gnu::always_inline]] inline void store1_x4(float64_t *ptr, float64x1x4_t val) { return vst1_f64_x4(ptr, val); }
1603[[gnu::always_inline]] inline void store1_x4(float64_t *ptr, float64x2x4_t val) { return vst1q_f64_x4(ptr, val); }
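// Note: the table_lookup*/table_extend* wrappers below map to the vqtbl/vqtbx intrinsics
// (TBL/TBX): idx selects bytes from a 1- to 4-register table. An out-of-range index yields
// zero for a lookup and leaves the corresponding byte of `a` unchanged for an extend.
// Illustrative: with a single 16-byte table, indices 0-15 select table bytes and any index
// >= 16 produces 0x00 in that result byte.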
1604[[gnu::always_inline]] nce poly8x8_t table_lookup1_saturate(poly8x16_t t, uint8x8_t idx) { return vqtbl1_p8(t, idx); }
1605[[gnu::always_inline]] nce poly8x16_t table_lookup1_saturate(poly8x16_t t, uint8x16_t idx) { return vqtbl1q_p8(t, idx); }
1606[[gnu::always_inline]] nce int8x8_t table_lookup2_saturate(int8x16x2_t t, uint8x8_t idx) { return vqtbl2_s8(t, idx); }
1607[[gnu::always_inline]] nce int8x16_t table_lookup2_saturate(int8x16x2_t t, uint8x16_t idx) { return vqtbl2q_s8(t, idx); }
1608[[gnu::always_inline]] nce uint8x8_t table_lookup2_saturate(uint8x16x2_t t, uint8x8_t idx) { return vqtbl2_u8(t, idx); }
1609[[gnu::always_inline]] nce uint8x16_t table_lookup2_saturate(uint8x16x2_t t, uint8x16_t idx) { return vqtbl2q_u8(t, idx); }
1610[[gnu::always_inline]] nce poly8x8_t table_lookup2_saturate(poly8x16x2_t t, uint8x8_t idx) { return vqtbl2_p8(t, idx); }
1611[[gnu::always_inline]] nce poly8x16_t table_lookup2_saturate(poly8x16x2_t t, uint8x16_t idx) { return vqtbl2q_p8(t, idx); }
1612[[gnu::always_inline]] nce int8x8_t table_lookup3_saturate(int8x16x3_t t, uint8x8_t idx) { return vqtbl3_s8(t, idx); }
1613[[gnu::always_inline]] nce int8x16_t table_lookup3_saturate(int8x16x3_t t, uint8x16_t idx) { return vqtbl3q_s8(t, idx); }
1614[[gnu::always_inline]] nce uint8x8_t table_lookup3_saturate(uint8x16x3_t t, uint8x8_t idx) { return vqtbl3_u8(t, idx); }
1615[[gnu::always_inline]] nce uint8x16_t table_lookup3_saturate(uint8x16x3_t t, uint8x16_t idx) { return vqtbl3q_u8(t, idx); }
1616[[gnu::always_inline]] nce poly8x8_t table_lookup3_saturate(poly8x16x3_t t, uint8x8_t idx) { return vqtbl3_p8(t, idx); }
1617[[gnu::always_inline]] nce poly8x16_t table_lookup3_saturate(poly8x16x3_t t, uint8x16_t idx) { return vqtbl3q_p8(t, idx); }
1618[[gnu::always_inline]] nce int8x8_t table_lookup4_saturate(int8x16x4_t t, uint8x8_t idx) { return vqtbl4_s8(t, idx); }
1619[[gnu::always_inline]] nce int8x16_t table_lookup4_saturate(int8x16x4_t t, uint8x16_t idx) { return vqtbl4q_s8(t, idx); }
1620[[gnu::always_inline]] nce uint8x8_t table_lookup4_saturate(uint8x16x4_t t, uint8x8_t idx) { return vqtbl4_u8(t, idx); }
1621[[gnu::always_inline]] nce uint8x16_t table_lookup4_saturate(uint8x16x4_t t, uint8x16_t idx) { return vqtbl4q_u8(t, idx); }
1622[[gnu::always_inline]] nce poly8x8_t table_lookup4_saturate(poly8x16x4_t t, uint8x8_t idx) { return vqtbl4_p8(t, idx); }
1623[[gnu::always_inline]] nce poly8x16_t table_lookup4_saturate(poly8x16x4_t t, uint8x16_t idx) { return vqtbl4q_p8(t, idx); }
1624[[gnu::always_inline]] nce poly8x16_t table_extend1_saturate(poly8x16_t a, poly8x16_t t, uint8x16_t idx) { return vqtbx1q_p8(a, t, idx); }
1625[[gnu::always_inline]] nce poly8x16_t table_extend2_saturate(poly8x16_t a, poly8x16x2_t t, uint8x16_t idx) { return vqtbx2q_p8(a, t, idx); }
1626[[gnu::always_inline]] nce poly8x16_t table_extend3_saturate(poly8x16_t a, poly8x16x3_t t, uint8x16_t idx) { return vqtbx3q_p8(a, t, idx); }
1627[[gnu::always_inline]] nce poly8x16_t table_extend4_saturate(poly8x16_t a, poly8x16x4_t t, uint8x16_t idx) { return vqtbx4q_p8(a, t, idx); }
1628#ifdef __ARM_FEATURE_LUT
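// Note: these wrappers map to the FEAT_LUT vluti2/vluti4 intrinsics (LUTI2/LUTI4): table
// lookups driven by packed 2-bit or 4-bit indices held in vm, with the `index` template
// parameter selecting which segment of the index register is consumed; see the ACLE
// definitions of the underlying intrinsics for the exact index layout.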
1629template <int index>[[gnu::always_inline]] nce uint8x16_t lookup_table_2bit_index_lane(uint8x8_t vn, uint8x8_t vm) { return vluti2_lane_u8(vn, vm, index); }
1630template <int index>[[gnu::always_inline]] nce uint8x16_t lookup_table_2bit_index_lane(uint8x8_t vn, uint8x16_t vm) { return vluti2_laneq_u8(vn, vm, index); }
1631template <int index>[[gnu::always_inline]] nce uint8x16_t lookup_table_2bit_index_lane(uint8x16_t vn, uint8x8_t vm) { return vluti2q_lane_u8(vn, vm, index); }
1632template <int index>[[gnu::always_inline]] nce uint8x16_t lookup_table_4bit_index_lane(uint8x16_t vn, uint8x8_t vm) { return vluti4q_lane_u8(vn, vm, index); }
1633template <int index>[[gnu::always_inline]] nce uint8x16_t lookup_table_2bit_index_lane(uint8x16_t vn, uint8x16_t vm) { return vluti2q_laneq_u8(vn, vm, index); }
1634template <int index>[[gnu::always_inline]] nce uint8x16_t lookup_table_4bit_index_lane(uint8x16_t vn, uint8x16_t vm) { return vluti4q_laneq_u8(vn, vm, index); }
1635template <int index>[[gnu::always_inline]] nce int8x16_t lookup_table_2bit_index_lane(int8x8_t vn, uint8x8_t vm) { return vluti2_lane_s8(vn, vm, index); }
1636template <int index>[[gnu::always_inline]] nce int8x16_t lookup_table_2bit_index_lane(int8x8_t vn, uint8x16_t vm) { return vluti2_laneq_s8(vn, vm, index); }
1637template <int index>[[gnu::always_inline]] nce int8x16_t lookup_table_2bit_index_lane(int8x16_t vn, uint8x8_t vm) { return vluti2q_lane_s8(vn, vm, index); }
1638template <int index>[[gnu::always_inline]] nce int8x16_t lookup_table_4bit_index_lane(int8x16_t vn, uint8x8_t vm) { return vluti4q_lane_s8(vn, vm, index); }
1639template <int index>[[gnu::always_inline]] nce int8x16_t lookup_table_2bit_index_lane(int8x16_t vn, uint8x16_t vm) { return vluti2q_laneq_s8(vn, vm, index); }
1640template <int index>[[gnu::always_inline]] nce int8x16_t lookup_table_4bit_index_lane(int8x16_t vn, uint8x16_t vm) { return vluti4q_laneq_s8(vn, vm, index); }
1641template <int index>[[gnu::always_inline]] nce uint16x8_t lookup_table_2bit_index_lane(uint16x4_t vn, uint8x8_t vm) { return vluti2_lane_u16(vn, vm, index); }
1642template <int index>[[gnu::always_inline]] nce uint16x8_t lookup_table_2bit_index_lane(uint16x4_t vn, uint8x16_t vm) { return vluti2_laneq_u16(vn, vm, index); }
1643template <int index>[[gnu::always_inline]] nce uint16x8_t lookup_table_2bit_index_lane(uint16x8_t vn, uint8x8_t vm) { return vluti2q_lane_u16(vn, vm, index); }
1644template <int index>[[gnu::always_inline]] nce uint16x8_t lookup_table_2bit_index_lane(uint16x8_t vn, uint8x16_t vm) { return vluti2q_laneq_u16(vn, vm, index); }
1645template <int index>[[gnu::always_inline]] nce int16x8_t lookup_table_2bit_index_lane(int16x4_t vn, uint8x8_t vm) { return vluti2_lane_s16(vn, vm, index); }
1646template <int index>[[gnu::always_inline]] nce int16x8_t lookup_table_2bit_index_lane(int16x4_t vn, uint8x16_t vm) { return vluti2_laneq_s16(vn, vm, index); }
1647template <int index>[[gnu::always_inline]] nce int16x8_t lookup_table_2bit_index_lane(int16x8_t vn, uint8x8_t vm) { return vluti2q_lane_s16(vn, vm, index); }
1648template <int index>[[gnu::always_inline]] nce int16x8_t lookup_table_2bit_index_lane(int16x8_t vn, uint8x16_t vm) { return vluti2q_laneq_s16(vn, vm, index); }
1649template <int index>[[gnu::always_inline]] nce float16x8_t lookup_table_2bit_index_lane(float16x4_t vn, uint8x8_t vm) { return vluti2_lane_f16(vn, vm, index); }
1650template <int index>[[gnu::always_inline]] nce float16x8_t lookup_table_2bit_index_lane(float16x4_t vn, uint8x16_t vm) { return vluti2_laneq_f16(vn, vm, index); }
1651template <int index>[[gnu::always_inline]] nce float16x8_t lookup_table_2bit_index_lane(float16x8_t vn, uint8x8_t vm) { return vluti2q_lane_f16(vn, vm, index); }
1652template <int index>[[gnu::always_inline]] nce float16x8_t lookup_table_2bit_index_lane(float16x8_t vn, uint8x16_t vm) { return vluti2q_laneq_f16(vn, vm, index); }
1653template <int index>[[gnu::always_inline]] nce poly8x16_t lookup_table_2bit_index_lane(poly8x8_t vn, uint8x8_t vm) { return vluti2_lane_p8(vn, vm, index); }
1654template <int index>[[gnu::always_inline]] nce poly8x16_t lookup_table_2bit_index_lane(poly8x8_t vn, uint8x16_t vm) { return vluti2_laneq_p8(vn, vm, index); }
1655template <int index>[[gnu::always_inline]] nce poly16x8_t lookup_table_2bit_index_lane(poly16x4_t vn, uint8x8_t vm) { return vluti2_lane_p16(vn, vm, index); }
1656template <int index>[[gnu::always_inline]] nce poly16x8_t lookup_table_2bit_index_lane(poly16x4_t vn, uint8x16_t vm) { return vluti2_laneq_p16(vn, vm, index); }
1657template <int index>[[gnu::always_inline]] nce poly8x16_t lookup_table_2bit_index_lane(poly8x16_t vn, uint8x8_t vm) { return vluti2q_lane_p8(vn, vm, index); }
1658template <int index>[[gnu::always_inline]] nce poly8x16_t lookup_table_2bit_index_lane(poly8x16_t vn, uint8x16_t vm) { return vluti2q_laneq_p8(vn, vm, index); }
1659template <int index>[[gnu::always_inline]] nce bfloat16x8_t lookup_table_2bit_index_lane(bfloat16x4_t vn, uint8x8_t vm) { return vluti2_lane_bf16(vn, vm, index); }
1660template <int index>[[gnu::always_inline]] nce bfloat16x8_t lookup_table_2bit_index_lane(bfloat16x4_t vn, uint8x16_t vm) { return vluti2_laneq_bf16(vn, vm, index); }
1661template <int index>[[gnu::always_inline]] nce bfloat16x8_t lookup_table_2bit_index_lane(bfloat16x8_t vn, uint8x8_t vm) { return vluti2q_lane_bf16(vn, vm, index); }
1662template <int index>[[gnu::always_inline]] nce bfloat16x8_t lookup_table_2bit_index_lane(bfloat16x8_t vn, uint8x16_t vm) { return vluti2q_laneq_bf16(vn, vm, index); }
1663template <int index>[[gnu::always_inline]] nce poly16x8_t lookup_table_2bit_index_lane(poly16x8_t vn, uint8x8_t vm) { return vluti2q_lane_p16(vn, vm, index); }
1664template <int index>[[gnu::always_inline]] nce poly16x8_t lookup_table_2bit_index_lane(poly16x8_t vn, uint8x16_t vm) { return vluti2q_laneq_p16(vn, vm, index); }
1665template <int index>[[gnu::always_inline]] nce poly8x16_t lookup_table_4bit_index_lane(poly8x16_t vn, uint8x8_t vm) { return vluti4q_lane_p8(vn, vm, index); }
1666template <int index>[[gnu::always_inline]] nce poly8x16_t lookup_table_4bit_index_lane(poly8x16_t vn, uint8x16_t vm) { return vluti4q_laneq_p8(vn, vm, index); }
1667template <int index>[[gnu::always_inline]] nce uint16x8_t lookup_table_4bit_index_lane(uint16x8x2_t vn, uint8x8_t vm) { return vluti4q_lane_u16_x2(vn, vm, index); }
1668template <int index>[[gnu::always_inline]] nce uint16x8_t lookup_table_4bit_index_lane(uint16x8x2_t vn, uint8x16_t vm) { return vluti4q_laneq_u16_x2(vn, vm, index); }
1669template <int index>[[gnu::always_inline]] nce int16x8_t lookup_table_4bit_index_lane(int16x8x2_t vn, uint8x8_t vm) { return vluti4q_lane_s16_x2(vn, vm, index); }
1670template <int index>[[gnu::always_inline]] nce int16x8_t lookup_table_4bit_index_lane(int16x8x2_t vn, uint8x16_t vm) { return vluti4q_laneq_s16_x2(vn, vm, index); }
1671template <int index>[[gnu::always_inline]] nce float16x8_t lookup_table_4bit_index_lane(float16x8x2_t vn, uint8x8_t vm) { return vluti4q_lane_f16_x2(vn, vm, index); }
1672template <int index>[[gnu::always_inline]] nce float16x8_t lookup_table_4bit_index_lane(float16x8x2_t vn, uint8x16_t vm) { return vluti4q_laneq_f16_x2(vn, vm, index); }
1673template <int index>[[gnu::always_inline]] nce bfloat16x8_t lookup_table_4bit_index_lane(bfloat16x8x2_t vn, uint8x8_t vm) { return vluti4q_lane_bf16_x2(vn, vm, index); }
1674template <int index>[[gnu::always_inline]] nce bfloat16x8_t lookup_table_4bit_index_lane(bfloat16x8x2_t vn, uint8x16_t vm) { return vluti4q_laneq_bf16_x2(vn, vm, index); }
1675template <int index>[[gnu::always_inline]] nce poly16x8_t lookup_table_4bit_index_lane(poly16x8x2_t vn, uint8x8_t vm) { return vluti4q_lane_p16_x2(vn, vm, index); }
1676template <int index>[[gnu::always_inline]] nce poly16x8_t lookup_table_4bit_index_lane(poly16x8x2_t vn, uint8x16_t vm) { return vluti4q_laneq_p16_x2(vn, vm, index); }
1677#endif
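// Note: multiply_double_add_round_saturate_high and multiply_double_subtract_round_saturate_high
// wrap the scalar vqrdmlah/vqrdmlsh intrinsics (Armv8.1 SQRDMLAH/SQRDMLSH): roughly
// a +/- ((2 * b * c + rounding) >> element_bits), saturated to the element range; the _lane
// forms take the second multiplicand from a selected vector lane.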
1678[[gnu::always_inline]] nce int16_t multiply_double_add_round_saturate_high(int16_t a, int16_t b, int16_t c) { return vqrdmlahh_s16(a, b, c); }
1679[[gnu::always_inline]] nce int32_t multiply_double_add_round_saturate_high(int32_t a, int32_t b, int32_t c) { return vqrdmlahs_s32(a, b, c); }
1680[[gnu::always_inline]] nce int16_t multiply_double_subtract_round_saturate_high(int16_t a, int16_t b, int16_t c) { return vqrdmlshh_s16(a, b, c); }
1681[[gnu::always_inline]] nce int32_t multiply_double_subtract_round_saturate_high(int32_t a, int32_t b, int32_t c) { return vqrdmlshs_s32(a, b, c); }
1682template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_add_round_saturate_high_lane(int16_t a, int16_t b, int16x4_t v) { return vqrdmlahh_lane_s16(a, b, v, lane); }
1683template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_add_round_saturate_high_lane(int16_t a, int16_t b, int16x8_t v) { return vqrdmlahh_laneq_s16(a, b, v, lane); }
1684template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_add_round_saturate_high_lane(int32_t a, int32_t b, int32x2_t v) { return vqrdmlahs_lane_s32(a, b, v, lane); }
1685template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_add_round_saturate_high_lane(int32_t a, int32_t b, int32x4_t v) { return vqrdmlahs_laneq_s32(a, b, v, lane); }
1686template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_subtract_round_saturate_high_lane(int16_t a, int16_t b, int16x4_t v) { return vqrdmlshh_lane_s16(a, b, v, lane); }
1687template <int lane>[[gnu::always_inline]] nce int16_t multiply_double_subtract_round_saturate_high_lane(int16_t a, int16_t b, int16x8_t v) { return vqrdmlshh_laneq_s16(a, b, v, lane); }
1688template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_subtract_round_saturate_high_lane(int32_t a, int32_t b, int32x2_t v) { return vqrdmlshs_lane_s32(a, b, v, lane); }
1689template <int lane>[[gnu::always_inline]] nce int32_t multiply_double_subtract_round_saturate_high_lane(int32_t a, int32_t b, int32x4_t v) { return vqrdmlshs_laneq_s32(a, b, v, lane); }
1690#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
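// Note: scalar half-precision helpers: arithmetic (absolute difference, reciprocal and
// reciprocal-sqrt estimate/step, min/max, extended multiply), comparisons returning an
// all-ones (0xFFFF) or all-zeros uint16_t mask, conversions to and from integer types
// (including the fixed-point variants parameterised on n), and lane-wise multiply and
// fused multiply-add/subtract against a vector operand.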
1691[[gnu::always_inline]] nce float16_t absolute_difference(float16_t a, float16_t b) { return vabdh_f16(a, b); }
1692[[gnu::always_inline]] nce float16_t reciprocal_estimate(float16_t a) { return vrecpeh_f16(a); }
1693[[gnu::always_inline]] nce float16_t reciprocal_exponent(float16_t a) { return vrecpxh_f16(a); }
1694[[gnu::always_inline]] nce float16_t reciprocal_sqrt_estimate(float16_t a) { return vrsqrteh_f16(a); }
1695[[gnu::always_inline]] nce float16_t reciprocal_sqrt_step(float16_t a, float16_t b) { return vrsqrtsh_f16(a, b); }
1696[[gnu::always_inline]] nce float16_t reciprocal_step(float16_t a, float16_t b) { return vrecpsh_f16(a, b); }
1697[[gnu::always_inline]] nce float16_t max(float16_t a, float16_t b) { return vmaxh_f16(a, b); }
1698[[gnu::always_inline]] nce float16_t min(float16_t a, float16_t b) { return vminh_f16(a, b); }
1699[[gnu::always_inline]] nce float16_t multiply_extended(float16_t a, float16_t b) { return vmulxh_f16(a, b); }
1700[[gnu::always_inline]] nce uint16_t equal_to_zero(float16_t a) { return vceqzh_f16(a); }
1701[[gnu::always_inline]] nce uint16_t greater_than_or_equal_to_zero(float16_t a) { return vcgezh_f16(a); }
1702[[gnu::always_inline]] nce uint16_t greater_than_zero(float16_t a) { return vcgtzh_f16(a); }
1703[[gnu::always_inline]] nce uint16_t less_than_or_equal_to_zero(float16_t a) { return vclezh_f16(a); }
1704[[gnu::always_inline]] nce uint16_t less_than_zero(float16_t a) { return vcltzh_f16(a); }
1705[[gnu::always_inline]] nce uint16_t absolute_greater_than_or_equal(float16_t a, float16_t b) { return vcageh_f16(a, b); }
1706[[gnu::always_inline]] nce uint16_t absolute_greater_than(float16_t a, float16_t b) { return vcagth_f16(a, b); }
1707[[gnu::always_inline]] nce uint16_t absolute_less_than_or_equal(float16_t a, float16_t b) { return vcaleh_f16(a, b); }
1708[[gnu::always_inline]] nce uint16_t absolute_less_than(float16_t a, float16_t b) { return vcalth_f16(a, b); }
1709[[gnu::always_inline]] nce uint16_t equal(float16_t a, float16_t b) { return vceqh_f16(a, b); }
1710[[gnu::always_inline]] nce uint16_t greater_than_or_equal(float16_t a, float16_t b) { return vcgeh_f16(a, b); }
1711[[gnu::always_inline]] nce uint16_t greater_than(float16_t a, float16_t b) { return vcgth_f16(a, b); }
1712[[gnu::always_inline]] nce uint16_t less_than_or_equal(float16_t a, float16_t b) { return vcleh_f16(a, b); }
1713[[gnu::always_inline]] nce uint16_t less_than(float16_t a, float16_t b) { return vclth_f16(a, b); }
1714template <> [[gnu::always_inline]] nce float16_t convert(int16_t a) { return vcvth_f16_s16(a); }
1715template <> [[gnu::always_inline]] nce float16_t convert(int64_t a) { return vcvth_f16_s64(a); }
1716template <> [[gnu::always_inline]] nce float16_t convert(uint16_t a) { return vcvth_f16_u16(a); }
1717template <> [[gnu::always_inline]] nce float16_t convert(uint64_t a) { return vcvth_f16_u64(a); }
1718template <> [[gnu::always_inline]] nce int16_t convert(float16_t a) { return vcvth_s16_f16(a); }
1719template <> [[gnu::always_inline]] nce int64_t convert(float16_t a) { return vcvth_s64_f16(a); }
1720template <> [[gnu::always_inline]] nce uint16_t convert(float16_t a) { return vcvth_u16_f16(a); }
1721template <> [[gnu::always_inline]] nce uint64_t convert(float16_t a) { return vcvth_u64_f16(a); }
1722template <> [[gnu::always_inline]] nce int16_t convert_round_to_nearest_with_ties_away_from_zero(float16_t a) { return vcvtah_s16_f16(a); }
1723template <> [[gnu::always_inline]] nce int64_t convert_round_to_nearest_with_ties_away_from_zero(float16_t a) { return vcvtah_s64_f16(a); }
1724template <> [[gnu::always_inline]] nce uint16_t convert_round_to_nearest_with_ties_away_from_zero(float16_t a) { return vcvtah_u16_f16(a); }
1725template <> [[gnu::always_inline]] nce uint64_t convert_round_to_nearest_with_ties_away_from_zero(float16_t a) { return vcvtah_u64_f16(a); }
1726template <> [[gnu::always_inline]] nce int16_t convert_round_toward_negative_infinity(float16_t a) { return vcvtmh_s16_f16(a); }
1727template <> [[gnu::always_inline]] nce int64_t convert_round_toward_negative_infinity(float16_t a) { return vcvtmh_s64_f16(a); }
1728template <> [[gnu::always_inline]] nce uint16_t convert_round_toward_negative_infinity(float16_t a) { return vcvtmh_u16_f16(a); }
1729template <> [[gnu::always_inline]] nce uint64_t convert_round_toward_negative_infinity(float16_t a) { return vcvtmh_u64_f16(a); }
1730template <> [[gnu::always_inline]] nce int16_t convert_round_to_nearest_with_ties_to_even(float16_t a) { return vcvtnh_s16_f16(a); }
1731template <> [[gnu::always_inline]] nce int64_t convert_round_to_nearest_with_ties_to_even(float16_t a) { return vcvtnh_s64_f16(a); }
1732template <> [[gnu::always_inline]] nce uint16_t convert_round_to_nearest_with_ties_to_even(float16_t a) { return vcvtnh_u16_f16(a); }
1733template <> [[gnu::always_inline]] nce uint64_t convert_round_to_nearest_with_ties_to_even(float16_t a) { return vcvtnh_u64_f16(a); }
1734template <> [[gnu::always_inline]] nce int16_t convert_round_toward_positive_infinity(float16_t a) { return vcvtph_s16_f16(a); }
1735template <> [[gnu::always_inline]] nce int64_t convert_round_toward_positive_infinity(float16_t a) { return vcvtph_s64_f16(a); }
1736template <> [[gnu::always_inline]] nce uint16_t convert_round_toward_positive_infinity(float16_t a) { return vcvtph_u16_f16(a); }
1737template <> [[gnu::always_inline]] nce uint64_t convert_round_toward_positive_infinity(float16_t a) { return vcvtph_u64_f16(a); }
1738template <int n>[[gnu::always_inline]] nce float16_t convert(int16_t a) { return vcvth_n_f16_s16(a, n); }
1739template <int n>[[gnu::always_inline]] nce float16_t convert(int64_t a) { return vcvth_n_f16_s64(a, n); }
1740template <int n>[[gnu::always_inline]] nce float16_t convert(uint16_t a) { return vcvth_n_f16_u16(a, n); }
1741template <int n>[[gnu::always_inline]] nce float16_t convert(uint64_t a) { return vcvth_n_f16_u64(a, n); }
1742template <int n>[[gnu::always_inline]] nce int16_t convert(float16_t a) { return vcvth_n_s16_f16(a, n); }
1743template <int n>[[gnu::always_inline]] nce int64_t convert(float16_t a) { return vcvth_n_s64_f16(a, n); }
1744template <int n>[[gnu::always_inline]] nce uint16_t convert(float16_t a) { return vcvth_n_u16_f16(a, n); }
1745template <int n>[[gnu::always_inline]] nce uint64_t convert(float16_t a) { return vcvth_n_u64_f16(a, n); }
1746template <int lane>[[gnu::always_inline]] nce float16_t multiply_lane(float16_t a, float16x4_t v) { return vmulh_lane_f16(a, v, lane); }
1747template <int lane>[[gnu::always_inline]] nce float16_t multiply_lane(float16_t a, float16x8_t v) { return vmulh_laneq_f16(a, v, lane); }
1748template <int lane>[[gnu::always_inline]] nce float16_t multiply_extended_lane(float16_t a, float16x4_t v) { return vmulxh_lane_f16(a, v, lane); }
1749template <int lane>[[gnu::always_inline]] nce float16_t multiply_extended_lane(float16_t a, float16x8_t v) { return vmulxh_laneq_f16(a, v, lane); }
1750template <int lane>[[gnu::always_inline]] nce float16_t multiply_add_fused_lane(float16_t a, float16_t b, float16x4_t v) { return vfmah_lane_f16(a, b, v, lane); }
1751template <int lane>[[gnu::always_inline]] nce float16_t multiply_add_fused_lane(float16_t a, float16_t b, float16x8_t v) { return vfmah_laneq_f16(a, b, v, lane); }
1752template <int lane>[[gnu::always_inline]] nce float16_t multiply_subtract_fused_lane(float16_t a, float16_t b, float16x4_t v) { return vfmsh_lane_f16(a, b, v, lane); }
1753template <int lane>[[gnu::always_inline]] nce float16_t multiply_subtract_fused_lane(float16_t a, float16_t b, float16x8_t v) { return vfmsh_laneq_f16(a, b, v, lane); }
1754#endif
1755
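// Note: bfloat16 helpers: copy_lane inserts lane2 of b into lane1 of a, and reinterpret is a
// pure bit-cast between bfloat16 and float64 vectors. The underlying vcopy*_bf16 and
// vreinterpret*_bf16 intrinsics require BF16 support in the compiler and target.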
1756template <int lane1, int lane2>[[gnu::always_inline]] nce bfloat16x4_t copy_lane(bfloat16x4_t a, bfloat16x4_t b) { return vcopy_lane_bf16(a, lane1, b, lane2); }
1757template <int lane1, int lane2>[[gnu::always_inline]] nce bfloat16x8_t copy_lane(bfloat16x8_t a, bfloat16x4_t b) { return vcopyq_lane_bf16(a, lane1, b, lane2); }
1758template <int lane1, int lane2>[[gnu::always_inline]] nce bfloat16x4_t copy_lane(bfloat16x4_t a, bfloat16x8_t b) { return vcopy_laneq_bf16(a, lane1, b, lane2); }
1759template <int lane1, int lane2>[[gnu::always_inline]] nce bfloat16x8_t copy_lane(bfloat16x8_t a, bfloat16x8_t b) { return vcopyq_laneq_bf16(a, lane1, b, lane2); }
1760template <> [[gnu::always_inline]] nce float64x1_t reinterpret(bfloat16x4_t a) { return vreinterpret_f64_bf16(a); }
1761template <> [[gnu::always_inline]] nce float64x2_t reinterpret(bfloat16x8_t a) { return vreinterpretq_f64_bf16(a); }
1762
1763}
1764#endif // __cplusplus
1765#undef nce