2#include "vfpv3_int.hpp"
4#ifdef __ARM_FEATURE_MVE
20template <
typename T> nce T convert(float32x4_t a);
21template <
typename T> nce T convert(int32x4_t a);
22template <
typename T> nce T convert(uint32x4_t a);
23template <
typename T> nce T reinterpret(float32x4_t a);
24template <> [[gnu::always_inline]] nce float32x4_t reinterpret(uint8x16_t a) {
return vreinterpretq_f32_u8(a); }
25template <> [[gnu::always_inline]] nce float32x4_t reinterpret(int8x16_t a) {
return vreinterpretq_f32_s8(a); }
26template <> [[gnu::always_inline]] nce float32x4_t reinterpret(uint16x8_t a) {
return vreinterpretq_f32_u16(a); }
27template <> [[gnu::always_inline]] nce float32x4_t reinterpret(int16x8_t a) {
return vreinterpretq_f32_s16(a); }
28template <> [[gnu::always_inline]] nce float32x4_t convert(int32x4_t a) {
return vcvtq_f32_s32(a); }
29template <> [[gnu::always_inline]] nce float32x4_t reinterpret(int32x4_t a) {
return vreinterpretq_f32_s32(a); }
30template <> [[gnu::always_inline]] nce float32x4_t reinterpret(uint64x2_t a) {
return vreinterpretq_f32_u64(a); }
31template <> [[gnu::always_inline]] nce float32x4_t convert(uint32x4_t a) {
return vcvtq_f32_u32(a); }
32template <> [[gnu::always_inline]] nce float32x4_t reinterpret(uint32x4_t a) {
return vreinterpretq_f32_u32(a); }
33[[gnu::always_inline]] nce float32x4_t add(float32x4_t a, float32x4_t b) {
return vaddq_f32(a, b); }
34[[gnu::always_inline]] nce float32x4_t multiply(float32x4_t a, float32x4_t b) {
return vmulq_f32(a, b); }
35[[gnu::always_inline]] nce float32x4_t subtract(float32x4_t a, float32x4_t b) {
return vsubq_f32(a, b); }
36[[gnu::always_inline]] nce float32x4_t subtract_absolute(float32x4_t a, float32x4_t b) {
return vabdq_f32(a, b); }
37[[gnu::always_inline]] nce float32x4_t abs(float32x4_t a) {
return vabsq_f32(a); }
38template <> [[gnu::always_inline]] nce int32x4_t convert(float32x4_t a) {
return vcvtq_s32_f32(a); }
39template <> [[gnu::always_inline]] nce uint32x4_t convert(float32x4_t a) {
return vcvtq_u32_f32(a); }
40template <> [[gnu::always_inline]] nce int8x16_t reinterpret(float32x4_t a) {
return vreinterpretq_s8_f32(a); }
41template <> [[gnu::always_inline]] nce int16x8_t reinterpret(float32x4_t a) {
return vreinterpretq_s16_f32(a); }
42template <> [[gnu::always_inline]] nce int32x4_t reinterpret(float32x4_t a) {
return vreinterpretq_s32_f32(a); }
43template <> [[gnu::always_inline]] nce uint8x16_t reinterpret(float32x4_t a) {
return vreinterpretq_u8_f32(a); }
44template <> [[gnu::always_inline]] nce uint16x8_t reinterpret(float32x4_t a) {
return vreinterpretq_u16_f32(a); }
45template <> [[gnu::always_inline]] nce uint32x4_t reinterpret(float32x4_t a) {
return vreinterpretq_u32_f32(a); }
46template <> [[gnu::always_inline]] nce uint64x2_t reinterpret(float32x4_t a) {
return vreinterpretq_u64_f32(a); }
47template <> [[gnu::always_inline]] nce int64x2_t reinterpret(float32x4_t a) {
return vreinterpretq_s64_f32(a); }
48[[gnu::always_inline]] nce float32x4_t multiply(float32x4_t a, float32_t b) {
return vmulq_n_f32(a, b); }
49[[gnu::always_inline]] nce float32x4_t negate(float32x4_t a) {
return vnegq_f32(a); }
50[[gnu::always_inline]] nce float32x4_t reverse_64bit(float32x4_t a) {
return vrev64q_f32(a); }
51template <> [[gnu::always_inline]] nce float32x4_t reinterpret(int64x2_t a) {
return vreinterpretq_f32_s64(a); }
52template <
int lane>[[gnu::always_inline]] nce float32_t get_lane(float32x4_t v) {
return vgetq_lane_f32(v, lane); }
53template <
int lane>[[gnu::always_inline]] nce float32x4_t set_lane(float32_t a, float32x4_t v) {
return vsetq_lane_f32(a, v, lane); }
54[[gnu::always_inline]]
inline void store1(float32_t *ptr, float32x4_t val) {
return vst1q_f32(ptr, val); }