Argon 0.1.0
Loading...
Searching...
No Matches
vfpv4.hpp
1#pragma once
2#include "vfpv3.hpp"
3
4#include "arm_simd/shared/vfpv4_float.hpp"
5
6#ifdef __cplusplus
// `nce` is the specifier placed on the wrappers below: `constexpr` when the
// compiler is clang, plain `inline` for every other compiler.
#if defined(__clang__)
#  define nce constexpr
#else
#  define nce inline
#endif
12namespace neon {
13// clang-format off
14template <typename T> nce T reinterpret(uint8x8_t a);
15template <typename T> nce T reinterpret(uint8x16_t a);
16template <typename T> nce T reinterpret(int8x8_t a);
17template <typename T> nce T reinterpret(int8x16_t a);
18template <typename T> nce T reinterpret(uint16x4_t a);
19template <typename T> nce T reinterpret(uint16x8_t a);
20template <typename T> nce T reinterpret(int16x4_t a);
21template <typename T> nce T reinterpret(int16x8_t a);
22template <typename T> nce T reinterpret(int32x2_t a);
23template <typename T> nce T reinterpret(int32x4_t a);
24template <typename T> nce T reinterpret(uint64x1_t a);
25template <typename T> nce T reinterpret(uint32x2_t a);
26template <typename T> nce T reinterpret(uint32x4_t a);
27template <typename T> nce T reinterpret(float16x4_t a);
28template <typename T> nce T reinterpret(float32x2_t a);
29template <typename T> nce T reinterpret(float32x4_t a);
30template <typename T> nce T reinterpret(poly8x8_t a);
31template <typename T> nce T reinterpret(poly16x4_t a);
32template <typename T> nce T reinterpret(int64x1_t a);
33template <typename T> nce T reinterpret(poly8x16_t a);
34template <typename T> nce T reinterpret(poly16x8_t a);
35template <typename T> nce T reinterpret(int64x2_t a);
36template <typename T> nce T load1(float16_t const *ptr);
37template <typename T> nce T load1_duplicate(float16_t const *ptr);
38template <typename T> nce T load2(float16_t const *ptr);
39template <typename T> nce T load3(float16_t const *ptr);
40template <typename T> nce T load4(float16_t const *ptr);
41template <typename T> nce T load2_duplicate(float16_t const *ptr);
42template <typename T> nce T load3_duplicate(float16_t const *ptr);
43template <typename T> nce T load4_duplicate(float16_t const *ptr);
44template <typename T> nce T load1_x2(float16_t const *ptr);
45template <typename T> nce T load1_x3(float16_t const *ptr);
46template <typename T> nce T load1_x4(float16_t const *ptr);
47template <typename T> nce T store1(float16_t *ptr, float16x4_t val);
48template <typename T> nce T store1(float16_t *ptr, float16x8_t val);
49template <typename T> nce T store2(float16_t *ptr, float16x4x2_t val);
50template <typename T> nce T store2(float16_t *ptr, float16x8x2_t val);
51template <typename T> nce T store3(float16_t *ptr, float16x4x3_t val);
52template <typename T> nce T store3(float16_t *ptr, float16x8x3_t val);
53template <typename T> nce T store4(float16_t *ptr, float16x4x4_t val);
54template <typename T> nce T store4(float16_t *ptr, float16x8x4_t val);
55template <typename T> nce T duplicate(float16_t value);
56template <typename T> nce T duplicate(float16x4_t vec);
57template <typename T> nce T move(float16_t value);
58template <typename T> nce T convert(float16x4_t a) ;
59template <typename T> nce T convert(float32x4_t a);
60template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint8x8_t a) { return vreinterpret_f16_u8(a); }
61template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int8x8_t a) { return vreinterpret_f16_s8(a); }
62template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint16x4_t a) { return vreinterpret_f16_u16(a); }
63[[gnu::always_inline]] nce float16x4_t bitwise_select(uint16x4_t a, float16x4_t b, float16x4_t c) { return vbsl_f16(a, b, c); }
64[[gnu::always_inline]] nce float16x8_t bitwise_select(uint16x8_t a, float16x8_t b, float16x8_t c) { return vbslq_f16(a, b, c); }
65template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int16x4_t a) { return vreinterpret_f16_s16(a); }
66template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int32x2_t a) { return vreinterpret_f16_s32(a); }
67template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint64x1_t a) { return vreinterpret_f16_u64(a); }
68template <> [[gnu::always_inline]] nce float64x2_t reinterpret(uint64x2_t a) { return vreinterpretq_f64_u64(a); }
69template <> [[gnu::always_inline]] nce float16x4_t reinterpret(uint32x2_t a) { return vreinterpret_f16_u32(a); }
70template <> [[gnu::always_inline]] nce float32x4_t convert(float16x4_t a) { return vcvt_f32_f16(a); }
71template <> [[gnu::always_inline]] nce int8x8_t reinterpret(float16x4_t a) { return vreinterpret_s8_f16(a); }
72template <> [[gnu::always_inline]] nce int16x4_t reinterpret(float16x4_t a) { return vreinterpret_s16_f16(a); }
73template <> [[gnu::always_inline]] nce int32x2_t reinterpret(float16x4_t a) { return vreinterpret_s32_f16(a); }
74template <> [[gnu::always_inline]] nce float32x2_t reinterpret(float16x4_t a) { return vreinterpret_f32_f16(a); }
75template <> [[gnu::always_inline]] nce uint8x8_t reinterpret(float16x4_t a) { return vreinterpret_u8_f16(a); }
76template <> [[gnu::always_inline]] nce uint16x4_t reinterpret(float16x4_t a) { return vreinterpret_u16_f16(a); }
77template <> [[gnu::always_inline]] nce uint32x2_t reinterpret(float16x4_t a) { return vreinterpret_u32_f16(a); }
78template <> [[gnu::always_inline]] nce poly8x8_t reinterpret(float16x4_t a) { return vreinterpret_p8_f16(a); }
79template <> [[gnu::always_inline]] nce poly16x4_t reinterpret(float16x4_t a) { return vreinterpret_p16_f16(a); }
80template <> [[gnu::always_inline]] nce uint64x1_t reinterpret(float16x4_t a) { return vreinterpret_u64_f16(a); }
81template <> [[gnu::always_inline]] nce int64x1_t reinterpret(float16x4_t a) { return vreinterpret_s64_f16(a); }
82[[gnu::always_inline]] nce float16x8_t combine(float16x4_t low, float16x4_t high) { return vcombine_f16(low, high); }
83template <int lane>[[gnu::always_inline]] nce float16_t get_lane(float16x4_t v) { return vget_lane_f16(v, lane); }
84[[gnu::always_inline]] nce float16x4x2_t zip(float16x4_t a, float16x4_t b) { return vzip_f16(a, b); }
85[[gnu::always_inline]] nce float16x4x2_t unzip(float16x4_t a, float16x4_t b) { return vuzp_f16(a, b); }
86[[gnu::always_inline]] nce float16x4x2_t transpose(float16x4_t a, float16x4_t b) { return vtrn_f16(a, b); }
87template <int lane>[[gnu::always_inline]] nce float16x4_t duplicate_lane(float16x4_t vec) { return vdup_lane_f16(vec, lane); }
88template <int lane>[[gnu::always_inline]] nce float16x8_t duplicate_lane(float16x4_t vec) { return vdupq_lane_f16(vec, lane); }
89template <int n>[[gnu::always_inline]] nce float16x4_t extract(float16x4_t a, float16x4_t b) { return vext_f16(a, b, n); }
90[[gnu::always_inline]] nce float16x4_t reverse_64bit(float16x4_t vec) { return vrev64_f16(vec); }
91template <> [[gnu::always_inline]] nce poly8x16_t reinterpret(float16x8_t a) { return vreinterpretq_p8_f16(a); }
92template <> [[gnu::always_inline]] nce poly16x8_t reinterpret(float16x8_t a) { return vreinterpretq_p16_f16(a); }
93[[gnu::always_inline]] nce float16x4_t get_high(float16x8_t a) { return vget_high_f16(a); }
94[[gnu::always_inline]] nce float16x4_t get_low(float16x8_t a) { return vget_low_f16(a); }
95[[gnu::always_inline]] nce float16x8x2_t zip(float16x8_t a, float16x8_t b) { return vzipq_f16(a, b); }
96[[gnu::always_inline]] nce float16x8x2_t unzip(float16x8_t a, float16x8_t b) { return vuzpq_f16(a, b); }
97[[gnu::always_inline]] nce float16x8x2_t transpose(float16x8_t a, float16x8_t b) { return vtrnq_f16(a, b); }
98template <int n>[[gnu::always_inline]] nce float16x8_t extract(float16x8_t a, float16x8_t b) { return vextq_f16(a, b, n); }
99[[gnu::always_inline]] nce float16x8_t reverse_64bit(float16x8_t vec) { return vrev64q_f16(vec); }
100template <> [[gnu::always_inline]] nce float16x4_t reinterpret(float32x2_t a) { return vreinterpret_f16_f32(a); }
101template <> [[gnu::always_inline]] nce float16x4_t convert(float32x4_t a) { return vcvt_f16_f32(a); }
102template <> [[gnu::always_inline]] nce float16x4_t reinterpret(poly8x8_t a) { return vreinterpret_f16_p8(a); }
103template <> [[gnu::always_inline]] nce float16x4_t reinterpret(poly16x4_t a) { return vreinterpret_f16_p16(a); }
104template <> [[gnu::always_inline]] nce float16x4_t reinterpret(int64x1_t a) { return vreinterpret_f16_s64(a); }
105template <> [[gnu::always_inline]] nce float16x8_t reinterpret(poly8x16_t a) { return vreinterpretq_f16_p8(a); }
106template <> [[gnu::always_inline]] nce float16x8_t reinterpret(poly16x8_t a) { return vreinterpretq_f16_p16(a); }
107template <> [[gnu::always_inline]] nce float16x4_t create(uint64_t a) { return vcreate_f16(a); }
108template <int lane>[[gnu::always_inline]] nce float16x4_t set_lane(float16_t a, float16x4_t v) { return vset_lane_f16(a, v, lane); }
109template <> [[gnu::always_inline]] inline float16x4_t load1(float16_t const *ptr) { return vld1_f16(ptr); }
110template <> [[gnu::always_inline]] inline float16x8_t load1(float16_t const *ptr) { return vld1q_f16(ptr); }
111template <int lane>[[gnu::always_inline]] nce float16x4_t load1_lane(float16_t const *ptr, float16x4_t src) { return vld1_lane_f16(ptr, src, lane); }
112template <int lane>[[gnu::always_inline]] nce float16x8_t load1_lane(float16_t const *ptr, float16x8_t src) { return vld1q_lane_f16(ptr, src, lane); }
113template <> [[gnu::always_inline]] inline float16x4_t load1_duplicate(float16_t const *ptr) { return vld1_dup_f16(ptr); }
114template <> [[gnu::always_inline]] inline float16x8_t load1_duplicate(float16_t const *ptr) { return vld1q_dup_f16(ptr); }
115template <> [[gnu::always_inline]] inline float16x4x2_t load2(float16_t const *ptr) { return vld2_f16(ptr); }
116template <> [[gnu::always_inline]] inline float16x8x2_t load2(float16_t const *ptr) { return vld2q_f16(ptr); }
117template <> [[gnu::always_inline]] inline float16x4x3_t load3(float16_t const *ptr) { return vld3_f16(ptr); }
118template <> [[gnu::always_inline]] inline float16x8x3_t load3(float16_t const *ptr) { return vld3q_f16(ptr); }
119template <> [[gnu::always_inline]] inline float16x4x4_t load4(float16_t const *ptr) { return vld4_f16(ptr); }
120template <> [[gnu::always_inline]] inline float16x8x4_t load4(float16_t const *ptr) { return vld4q_f16(ptr); }
121template <> [[gnu::always_inline]] inline float16x4x2_t load2_duplicate(float16_t const *ptr) { return vld2_dup_f16(ptr); }
122template <> [[gnu::always_inline]] inline float16x8x2_t load2_duplicate(float16_t const *ptr) { return vld2q_dup_f16(ptr); }
123template <> [[gnu::always_inline]] inline float16x4x3_t load3_duplicate(float16_t const *ptr) { return vld3_dup_f16(ptr); }
124template <> [[gnu::always_inline]] inline float16x8x3_t load3_duplicate(float16_t const *ptr) { return vld3q_dup_f16(ptr); }
125template <> [[gnu::always_inline]] inline float16x4x4_t load4_duplicate(float16_t const *ptr) { return vld4_dup_f16(ptr); }
126template <> [[gnu::always_inline]] inline float16x8x4_t load4_duplicate(float16_t const *ptr) { return vld4q_dup_f16(ptr); }
127template <int lane>[[gnu::always_inline]] nce float16x4x2_t load2_lane(float16_t const *ptr, float16x4x2_t src) { return vld2_lane_f16(ptr, src, lane); }
128template <int lane>[[gnu::always_inline]] nce float16x8x2_t load2_lane(float16_t const *ptr, float16x8x2_t src) { return vld2q_lane_f16(ptr, src, lane); }
129template <int lane>[[gnu::always_inline]] nce float16x4x3_t load3_lane(float16_t const *ptr, float16x4x3_t src) { return vld3_lane_f16(ptr, src, lane); }
130template <int lane>[[gnu::always_inline]] nce float16x8x3_t load3_lane(float16_t const *ptr, float16x8x3_t src) { return vld3q_lane_f16(ptr, src, lane); }
131template <int lane>[[gnu::always_inline]] nce float16x4x4_t load4_lane(float16_t const *ptr, float16x4x4_t src) { return vld4_lane_f16(ptr, src, lane); }
132template <int lane>[[gnu::always_inline]] nce float16x8x4_t load4_lane(float16_t const *ptr, float16x8x4_t src) { return vld4q_lane_f16(ptr, src, lane); }
133template <> [[gnu::always_inline]] inline float16x4x2_t load1_x2(float16_t const *ptr) { return vld1_f16_x2(ptr); }
134template <> [[gnu::always_inline]] inline float16x8x2_t load1_x2(float16_t const *ptr) { return vld1q_f16_x2(ptr); }
135template <> [[gnu::always_inline]] inline float16x4x3_t load1_x3(float16_t const *ptr) { return vld1_f16_x3(ptr); }
136template <> [[gnu::always_inline]] inline float16x8x3_t load1_x3(float16_t const *ptr) { return vld1q_f16_x3(ptr); }
137template <> [[gnu::always_inline]] inline float16x4x4_t load1_x4(float16_t const *ptr) { return vld1_f16_x4(ptr); }
138template <> [[gnu::always_inline]] inline float16x8x4_t load1_x4(float16_t const *ptr) { return vld1q_f16_x4(ptr); }
139template <> [[gnu::always_inline]] inline void store1(float16_t *ptr, float16x4_t val) { return vst1_f16(ptr, val); }
140template <> [[gnu::always_inline]] inline void store1(float16_t *ptr, float16x8_t val) { return vst1q_f16(ptr, val); }
141template <int lane>[[gnu::always_inline]] nce void store1_lane(float16_t *ptr, float16x4_t val) { return vst1_lane_f16(ptr, val, lane); }
142template <int lane>[[gnu::always_inline]] nce void store1_lane(float16_t *ptr, float16x8_t val) { return vst1q_lane_f16(ptr, val, lane); }
143template <> [[gnu::always_inline]] inline void store2(float16_t *ptr, float16x4x2_t val) { return vst2_f16(ptr, val); }
144template <> [[gnu::always_inline]] inline void store2(float16_t *ptr, float16x8x2_t val) { return vst2q_f16(ptr, val); }
145template <> [[gnu::always_inline]] inline void store3(float16_t *ptr, float16x4x3_t val) { return vst3_f16(ptr, val); }
146template <> [[gnu::always_inline]] inline void store3(float16_t *ptr, float16x8x3_t val) { return vst3q_f16(ptr, val); }
147template <> [[gnu::always_inline]] inline void store4(float16_t *ptr, float16x4x4_t val) { return vst4_f16(ptr, val); }
148template <> [[gnu::always_inline]] inline void store4(float16_t *ptr, float16x8x4_t val) { return vst4q_f16(ptr, val); }
149template <int lane>[[gnu::always_inline]] nce void store2_lane(float16_t *ptr, float16x4x2_t val) { return vst2_lane_f16(ptr, val, lane); }
150template <int lane>[[gnu::always_inline]] nce void store2_lane(float16_t *ptr, float16x8x2_t val) { return vst2q_lane_f16(ptr, val, lane); }
151template <int lane>[[gnu::always_inline]] nce void store3_lane(float16_t *ptr, float16x4x3_t val) { return vst3_lane_f16(ptr, val, lane); }
152template <int lane>[[gnu::always_inline]] nce void store3_lane(float16_t *ptr, float16x8x3_t val) { return vst3q_lane_f16(ptr, val, lane); }
153template <int lane>[[gnu::always_inline]] nce void store4_lane(float16_t *ptr, float16x4x4_t val) { return vst4_lane_f16(ptr, val, lane); }
154template <int lane>[[gnu::always_inline]] nce void store4_lane(float16_t *ptr, float16x8x4_t val) { return vst4q_lane_f16(ptr, val, lane); }
155[[gnu::always_inline]] inline void store1_x2(float16_t *ptr, float16x4x2_t val) { return vst1_f16_x2(ptr, val); }
156[[gnu::always_inline]] inline void store1_x2(float16_t *ptr, float16x8x2_t val) { return vst1q_f16_x2(ptr, val); }
157[[gnu::always_inline]] inline void store1_x3(float16_t *ptr, float16x4x3_t val) { return vst1_f16_x3(ptr, val); }
158[[gnu::always_inline]] inline void store1_x3(float16_t *ptr, float16x8x3_t val) { return vst1q_f16_x3(ptr, val); }
159[[gnu::always_inline]] inline void store1_x4(float16_t *ptr, float16x4x4_t val) { return vst1_f16_x4(ptr, val); }
160[[gnu::always_inline]] inline void store1_x4(float16_t *ptr, float16x8x4_t val) { return vst1q_f16_x4(ptr, val); }
161template <> [[gnu::always_inline]] nce float16x4_t duplicate(float16_t value) { return vdup_n_f16(value); }
162template <> [[gnu::always_inline]] nce float16x8_t duplicate(float16_t value) { return vdupq_n_f16(value); }
163template <> [[gnu::always_inline]] nce float16x4_t move(float16_t value) { return vmov_n_f16(value); }
164template <> [[gnu::always_inline]] nce float16x8_t move(float16_t value) { return vmovq_n_f16(value); }
165[[gnu::always_inline]] nce float32x2_t multiply_add_fused(float32x2_t a, float32x2_t b, float32x2_t c) { return vfma_f32(a, b, c);}
166[[gnu::always_inline]] nce float32x2_t multiply_subtract_fused(float32x2_t a, float32x2_t b, float32x2_t c) { return vfms_f32(a, b, c); }
167[[gnu::always_inline]] nce float32x4_t multiply_add_fused(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c);}
168[[gnu::always_inline]] nce float32x4_t multiply_subtract_fused(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmsq_f32(a, b, c); }
169[[gnu::always_inline]] nce float32x2_t multiply_add_fused(float32x2_t a, float32x2_t b, float32_t c) { return vfma_n_f32(a, b, c);}
170[[gnu::always_inline]] nce float32x2_t multiply_subtract_fused(float32x2_t a, float32x2_t b, float32_t c) { return vfms_n_f32(a, b, c); }
171[[gnu::always_inline]] nce float32x4_t multiply_add_fused(float32x4_t a, float32x4_t b, float32_t c) { return vfmaq_n_f32(a, b, c);}
172[[gnu::always_inline]] nce float32x4_t multiply_subtract_fused(float32x4_t a, float32x4_t b, float32_t c) { return vfmsq_n_f32(a, b, c); }
173// clang-format on
174} // namespace neon
175#undef nce
176#endif // __cplusplus