#include <type_traits>

#include "arm_simd/shared/vfpv4_float.hpp"
14template <
typename T> nce T reinterpret(uint8x8_t a);
15template <
typename T> nce T reinterpret(uint8x16_t a);
16template <
typename T> nce T reinterpret(int8x8_t a);
17template <
typename T> nce T reinterpret(int8x16_t a);
18template <
typename T> nce T reinterpret(uint16x4_t a);
19template <
typename T> nce T reinterpret(uint16x8_t a);
20template <
typename T> nce T reinterpret(int16x4_t a);
21template <
typename T> nce T reinterpret(int16x8_t a);
22template <
typename T> nce T reinterpret(int32x2_t a);
23template <
typename T> nce T reinterpret(int32x4_t a);
24template <
typename T> nce T reinterpret(uint64x1_t a);
25template <
typename T> nce T reinterpret(uint32x2_t a);
26template <
typename T> nce T reinterpret(uint32x4_t a);
27template <
typename T> nce T reinterpret(float16x4_t a);
28template <
typename T> nce T reinterpret(float32x2_t a);
29template <
typename T> nce T reinterpret(float32x4_t a);
30template <
typename T> nce T reinterpret(poly8x8_t a);
31template <
typename T> nce T reinterpret(poly16x4_t a);
32template <
typename T> nce T reinterpret(int64x1_t a);
33template <
typename T> nce T reinterpret(poly8x16_t a);
34template <
typename T> nce T reinterpret(poly16x8_t a);
35template <
typename T> nce T reinterpret(int64x2_t a);
36template <
typename T> nce T load1(float16_t
const *ptr);
37template <
typename T> nce T load1_duplicate(float16_t
const *ptr);
38template <
typename T> nce T load2(float16_t
const *ptr);
39template <
typename T> nce T load3(float16_t
const *ptr);
40template <
typename T> nce T load4(float16_t
const *ptr);
41template <
typename T> nce T load2_duplicate(float16_t
const *ptr);
42template <
typename T> nce T load3_duplicate(float16_t
const *ptr);
43template <
typename T> nce T load4_duplicate(float16_t
const *ptr);
44template <
typename T> nce T load1_x2(float16_t
const *ptr);
45template <
typename T> nce T load1_x3(float16_t
const *ptr);
46template <
typename T> nce T load1_x4(float16_t
const *ptr);
47template <
typename T> nce T store1(float16_t *ptr, float16x4_t val);
48template <
typename T> nce T store1(float16_t *ptr, float16x8_t val);
49template <
typename T> nce T store2(float16_t *ptr, float16x4x2_t val);
50template <
typename T> nce T store2(float16_t *ptr, float16x8x2_t val);
51template <
typename T> nce T store3(float16_t *ptr, float16x4x3_t val);
52template <
typename T> nce T store3(float16_t *ptr, float16x8x3_t val);
53template <
typename T> nce T store4(float16_t *ptr, float16x4x4_t val);
54template <
typename T> nce T store4(float16_t *ptr, float16x8x4_t val);
55template <
typename T> nce T duplicate(float16_t value);
56template <
typename T> nce T duplicate(float16x4_t vec);
57template <
typename T> nce T move(float16_t value);
58template <
typename T> nce T convert(float16x4_t a) ;
59template <
typename T> nce T convert(float32x4_t a);
60template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint8x8_t a) {
return vreinterpret_f16_u8(a); }
61template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int8x8_t a) {
return vreinterpret_f16_s8(a); }
62template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint16x4_t a) {
return vreinterpret_f16_u16(a); }
63[[gnu::always_inline]] nce float16x4_t bitwise_select(uint16x4_t a, float16x4_t b, float16x4_t c) {
return vbsl_f16(a, b, c); }
64[[gnu::always_inline]] nce float16x8_t bitwise_select(uint16x8_t a, float16x8_t b, float16x8_t c) {
return vbslq_f16(a, b, c); }
65template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int16x4_t a) {
return vreinterpret_f16_s16(a); }
66template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int32x2_t a) {
return vreinterpret_f16_s32(a); }
67template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint64x1_t a) {
return vreinterpret_f16_u64(a); }
68template <> [[gnu::always_inline]] nce float64x2_t reinterpret(uint64x2_t a) {
return vreinterpretq_f64_u64(a); }
69template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint32x2_t a) {
return vreinterpret_f16_u32(a); }
70template <> [[gnu::always_inline]] nce float32x4_t convert(float16x4_t a) {
return vcvt_f32_f16(a); }
71template <> [[gnu::always_inline]] nce int8x8_t reinterpret(float16x4_t a) {
return vreinterpret_s8_f16(a); }
72template <> [[gnu::always_inline]] nce int16x4_t reinterpret(float16x4_t a) {
return vreinterpret_s16_f16(a); }
73template <> [[gnu::always_inline]] nce int32x2_t reinterpret(float16x4_t a) {
return vreinterpret_s32_f16(a); }
74template <> [[gnu::always_inline]] nce float32x2_t reinterpret(float16x4_t a) {
return vreinterpret_f32_f16(a); }
75template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(float16x4_t a) {
return vreinterpret_u8_f16(a); }
76template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(float16x4_t a) {
return vreinterpret_u16_f16(a); }
77template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(float16x4_t a) {
return vreinterpret_u32_f16(a); }
78template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(float16x4_t a) {
return vreinterpret_p8_f16(a); }
79template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(float16x4_t a) {
return vreinterpret_p16_f16(a); }
80template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(float16x4_t a) {
return vreinterpret_u64_f16(a); }
81template <> [[gnu::always_inline]] nce int64x1_t reinterpret(float16x4_t a) {
return vreinterpret_s64_f16(a); }
82[[gnu::always_inline]] nce float16x8_t combine(float16x4_t low, float16x4_t high) {
return vcombine_f16(low, high); }
83template <
int lane>[[gnu::always_inline]] nce float16_t get_lane(float16x4_t v) {
return vget_lane_f16(v, lane); }
84[[gnu::always_inline]] nce float16x4x2_t zip(float16x4_t a, float16x4_t b) {
return vzip_f16(a, b); }
85[[gnu::always_inline]] nce float16x4x2_t unzip(float16x4_t a, float16x4_t b) {
return vuzp_f16(a, b); }
86[[gnu::always_inline]] nce float16x4x2_t transpose(float16x4_t a, float16x4_t b) {
return vtrn_f16(a, b); }
87template <
int lane>[[gnu::always_inline]] nce float16x4_t duplicate_lane(float16x4_t vec) {
return vdup_lane_f16(vec, lane); }
88template <
int lane>[[gnu::always_inline]] nce float16x8_t duplicate_lane(float16x4_t vec) {
return vdupq_lane_f16(vec, lane); }
89template <
int n>[[gnu::always_inline]] nce float16x4_t extract(float16x4_t a, float16x4_t b) {
return vext_f16(a, b, n); }
90[[gnu::always_inline]] nce float16x4_t reverse_64bit(float16x4_t vec) {
return vrev64_f16(vec); }
91template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(float16x8_t a) {
return vreinterpretq_p8_f16(a); }
92template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(float16x8_t a) {
return vreinterpretq_p16_f16(a); }
93[[gnu::always_inline]] nce float16x4_t get_high(float16x8_t a) {
return vget_high_f16(a); }
94[[gnu::always_inline]] nce float16x4_t get_low(float16x8_t a) {
return vget_low_f16(a); }
95[[gnu::always_inline]] nce float16x8x2_t zip(float16x8_t a, float16x8_t b) {
return vzipq_f16(a, b); }
96[[gnu::always_inline]] nce float16x8x2_t unzip(float16x8_t a, float16x8_t b) {
return vuzpq_f16(a, b); }
97[[gnu::always_inline]] nce float16x8x2_t transpose(float16x8_t a, float16x8_t b) {
return vtrnq_f16(a, b); }
98template <
int n>[[gnu::always_inline]] nce float16x8_t extract(float16x8_t a, float16x8_t b) {
return vextq_f16(a, b, n); }
99[[gnu::always_inline]] nce float16x8_t reverse_64bit(float16x8_t vec) {
return vrev64q_f16(vec); }
100template <> [[gnu::always_inline]] nce float16x4_t reinterpret(float32x2_t a) {
return vreinterpret_f16_f32(a); }
101template <> [[gnu::always_inline]] nce float16x4_t convert(float32x4_t a) {
return vcvt_f16_f32(a); }
102template <> [[gnu::always_inline]] nce float16x4_t reinterpret(poly8x8_t a) {
return vreinterpret_f16_p8(a); }
103template <> [[gnu::always_inline]] nce float16x4_t reinterpret(poly16x4_t a) {
return vreinterpret_f16_p16(a); }
104template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int64x1_t a) {
return vreinterpret_f16_s64(a); }
105template <> [[gnu::always_inline]] nce float16x8_t reinterpret(poly8x16_t a) {
return vreinterpretq_f16_p8(a); }
106template <> [[gnu::always_inline]] nce float16x8_t reinterpret(poly16x8_t a) {
return vreinterpretq_f16_p16(a); }
107template <> [[gnu::always_inline]] nce float16x4_t create(uint64_t a) {
return vcreate_f16(a); }
108template <
int lane>[[gnu::always_inline]] nce float16x4_t set_lane(float16_t a, float16x4_t v) {
return vset_lane_f16(a, v, lane); }
109template <> [[gnu::always_inline]]
inline float16x4_t load1(float16_t
const *ptr) {
return vld1_f16(ptr); }
110template <> [[gnu::always_inline]]
inline float16x8_t load1(float16_t
const *ptr) {
return vld1q_f16(ptr); }
111template <
int lane>[[gnu::always_inline]] nce float16x4_t load1_lane(float16_t
const *ptr, float16x4_t src) {
return vld1_lane_f16(ptr, src, lane); }
112template <
int lane>[[gnu::always_inline]] nce float16x8_t load1_lane(float16_t
const *ptr, float16x8_t src) {
return vld1q_lane_f16(ptr, src, lane); }
113template <> [[gnu::always_inline]]
inline float16x4_t load1_duplicate(float16_t
const *ptr) {
return vld1_dup_f16(ptr); }
114template <> [[gnu::always_inline]]
inline float16x8_t load1_duplicate(float16_t
const *ptr) {
return vld1q_dup_f16(ptr); }
115template <> [[gnu::always_inline]]
inline float16x4x2_t load2(float16_t
const *ptr) {
return vld2_f16(ptr); }
116template <> [[gnu::always_inline]]
inline float16x8x2_t load2(float16_t
const *ptr) {
return vld2q_f16(ptr); }
117template <> [[gnu::always_inline]]
inline float16x4x3_t load3(float16_t
const *ptr) {
return vld3_f16(ptr); }
118template <> [[gnu::always_inline]]
inline float16x8x3_t load3(float16_t
const *ptr) {
return vld3q_f16(ptr); }
119template <> [[gnu::always_inline]]
inline float16x4x4_t load4(float16_t
const *ptr) {
return vld4_f16(ptr); }
120template <> [[gnu::always_inline]]
inline float16x8x4_t load4(float16_t
const *ptr) {
return vld4q_f16(ptr); }
121template <> [[gnu::always_inline]]
inline float16x4x2_t load2_duplicate(float16_t
const *ptr) {
return vld2_dup_f16(ptr); }
122template <> [[gnu::always_inline]]
inline float16x8x2_t load2_duplicate(float16_t
const *ptr) {
return vld2q_dup_f16(ptr); }
123template <> [[gnu::always_inline]]
inline float16x4x3_t load3_duplicate(float16_t
const *ptr) {
return vld3_dup_f16(ptr); }
124template <> [[gnu::always_inline]]
inline float16x8x3_t load3_duplicate(float16_t
const *ptr) {
return vld3q_dup_f16(ptr); }
125template <> [[gnu::always_inline]]
inline float16x4x4_t load4_duplicate(float16_t
const *ptr) {
return vld4_dup_f16(ptr); }
126template <> [[gnu::always_inline]]
inline float16x8x4_t load4_duplicate(float16_t
const *ptr) {
return vld4q_dup_f16(ptr); }
127template <
int lane>[[gnu::always_inline]] nce float16x4x2_t load2_lane(float16_t
const *ptr, float16x4x2_t src) {
return vld2_lane_f16(ptr, src, lane); }
128template <
int lane>[[gnu::always_inline]] nce float16x8x2_t load2_lane(float16_t
const *ptr, float16x8x2_t src) {
return vld2q_lane_f16(ptr, src, lane); }
129template <
int lane>[[gnu::always_inline]] nce float16x4x3_t load3_lane(float16_t
const *ptr, float16x4x3_t src) {
return vld3_lane_f16(ptr, src, lane); }
130template <
int lane>[[gnu::always_inline]] nce float16x8x3_t load3_lane(float16_t
const *ptr, float16x8x3_t src) {
return vld3q_lane_f16(ptr, src, lane); }
131template <
int lane>[[gnu::always_inline]] nce float16x4x4_t load4_lane(float16_t
const *ptr, float16x4x4_t src) {
return vld4_lane_f16(ptr, src, lane); }
132template <
int lane>[[gnu::always_inline]] nce float16x8x4_t load4_lane(float16_t
const *ptr, float16x8x4_t src) {
return vld4q_lane_f16(ptr, src, lane); }
133template <> [[gnu::always_inline]]
inline float16x4x2_t load1_x2(float16_t
const *ptr) {
return vld1_f16_x2(ptr); }
134template <> [[gnu::always_inline]]
inline float16x8x2_t load1_x2(float16_t
const *ptr) {
return vld1q_f16_x2(ptr); }
135template <> [[gnu::always_inline]]
inline float16x4x3_t load1_x3(float16_t
const *ptr) {
return vld1_f16_x3(ptr); }
136template <> [[gnu::always_inline]]
inline float16x8x3_t load1_x3(float16_t
const *ptr) {
return vld1q_f16_x3(ptr); }
137template <> [[gnu::always_inline]]
inline float16x4x4_t load1_x4(float16_t
const *ptr) {
return vld1_f16_x4(ptr); }
138template <> [[gnu::always_inline]]
inline float16x8x4_t load1_x4(float16_t
const *ptr) {
return vld1q_f16_x4(ptr); }
139template <> [[gnu::always_inline]]
inline void store1(float16_t *ptr, float16x4_t val) {
return vst1_f16(ptr, val); }
140template <> [[gnu::always_inline]]
inline void store1(float16_t *ptr, float16x8_t val) {
return vst1q_f16(ptr, val); }
141template <
int lane>[[gnu::always_inline]] nce
void store1_lane(float16_t *ptr, float16x4_t val) {
return vst1_lane_f16(ptr, val, lane); }
142template <
int lane>[[gnu::always_inline]] nce
void store1_lane(float16_t *ptr, float16x8_t val) {
return vst1q_lane_f16(ptr, val, lane); }
143template <> [[gnu::always_inline]]
inline void store2(float16_t *ptr, float16x4x2_t val) {
return vst2_f16(ptr, val); }
144template <> [[gnu::always_inline]]
inline void store2(float16_t *ptr, float16x8x2_t val) {
return vst2q_f16(ptr, val); }
145template <> [[gnu::always_inline]]
inline void store3(float16_t *ptr, float16x4x3_t val) {
return vst3_f16(ptr, val); }
146template <> [[gnu::always_inline]]
inline void store3(float16_t *ptr, float16x8x3_t val) {
return vst3q_f16(ptr, val); }
147template <> [[gnu::always_inline]]
inline void store4(float16_t *ptr, float16x4x4_t val) {
return vst4_f16(ptr, val); }
148template <> [[gnu::always_inline]]
inline void store4(float16_t *ptr, float16x8x4_t val) {
return vst4q_f16(ptr, val); }
149template <
int lane>[[gnu::always_inline]] nce
void store2_lane(float16_t *ptr, float16x4x2_t val) {
return vst2_lane_f16(ptr, val, lane); }
150template <
int lane>[[gnu::always_inline]] nce
void store2_lane(float16_t *ptr, float16x8x2_t val) {
return vst2q_lane_f16(ptr, val, lane); }
151template <
int lane>[[gnu::always_inline]] nce
void store3_lane(float16_t *ptr, float16x4x3_t val) {
return vst3_lane_f16(ptr, val, lane); }
152template <
int lane>[[gnu::always_inline]] nce
void store3_lane(float16_t *ptr, float16x8x3_t val) {
return vst3q_lane_f16(ptr, val, lane); }
153template <
int lane>[[gnu::always_inline]] nce
void store4_lane(float16_t *ptr, float16x4x4_t val) {
return vst4_lane_f16(ptr, val, lane); }
154template <
int lane>[[gnu::always_inline]] nce
void store4_lane(float16_t *ptr, float16x8x4_t val) {
return vst4q_lane_f16(ptr, val, lane); }
155[[gnu::always_inline]]
inline void store1_x2(float16_t *ptr, float16x4x2_t val) {
return vst1_f16_x2(ptr, val); }
156[[gnu::always_inline]]
inline void store1_x2(float16_t *ptr, float16x8x2_t val) {
return vst1q_f16_x2(ptr, val); }
157[[gnu::always_inline]]
inline void store1_x3(float16_t *ptr, float16x4x3_t val) {
return vst1_f16_x3(ptr, val); }
158[[gnu::always_inline]]
inline void store1_x3(float16_t *ptr, float16x8x3_t val) {
return vst1q_f16_x3(ptr, val); }
159[[gnu::always_inline]]
inline void store1_x4(float16_t *ptr, float16x4x4_t val) {
return vst1_f16_x4(ptr, val); }
160[[gnu::always_inline]]
inline void store1_x4(float16_t *ptr, float16x8x4_t val) {
return vst1q_f16_x4(ptr, val); }
161template <> [[gnu::always_inline]] nce float16x4_t duplicate(float16_t value) {
return vdup_n_f16(value); }
162template <> [[gnu::always_inline]] nce float16x8_t duplicate(float16_t value) {
return vdupq_n_f16(value); }
163template <> [[gnu::always_inline]] nce float16x4_t move(float16_t value) {
return vmov_n_f16(value); }
164template <> [[gnu::always_inline]] nce float16x8_t move(float16_t value) {
return vmovq_n_f16(value); }
165[[gnu::always_inline]] nce float32x2_t multiply_add_fused(float32x2_t a, float32x2_t b, float32x2_t c) {
return vfma_f32(a, b, c);}
166[[gnu::always_inline]] nce float32x2_t multiply_subtract_fused(float32x2_t a, float32x2_t b, float32x2_t c) {
return vfms_f32(a, b, c); }
167[[gnu::always_inline]] nce float32x4_t multiply_add_fused(float32x4_t a, float32x4_t b, float32x4_t c) {
return vfmaq_f32(a, b, c);}
168[[gnu::always_inline]] nce float32x4_t multiply_subtract_fused(float32x4_t a, float32x4_t b, float32x4_t c) {
return vfmsq_f32(a, b, c); }
169[[gnu::always_inline]] nce float32x2_t multiply_add_fused(float32x2_t a, float32x2_t b, float32_t c) {
return vfma_n_f32(a, b, c);}
170[[gnu::always_inline]] nce float32x2_t multiply_subtract_fused(float32x2_t a, float32x2_t b, float32_t c) {
return vfms_n_f32(a, b, c); }
171[[gnu::always_inline]] nce float32x4_t multiply_add_fused(float32x4_t a, float32x4_t b, float32_t c) {
return vfmaq_n_f32(a, b, c);}
172[[gnu::always_inline]] nce float32x4_t multiply_subtract_fused(float32x4_t a, float32x4_t b, float32_t c) {
return vfmsq_n_f32(a, b, c); }