@@ -336,6 +336,14 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
336336 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
337337 * fp0 is the same for fp0 of result.
338338 */
#if defined(__aarch64__)
/*
 * Byte-index vectors derived from four lane selectors.
 *
 * Each selector fpN expands to the four consecutive byte indices
 * fpN*4+0 .. fpN*4+3. The groups are emitted in the order fp3, fp2, fp1, fp0,
 * so fp3 supplies bytes 0-3 of the resulting uint8x16_t and fp0 supplies
 * bytes 12-15.
 *
 * _MN_SHUFFLE uses the plain indices for all four groups.
 * _MF_SHUFFLE adds 16 to the indices of the fp1 and fp0 groups (bytes 8-15).
 *
 * Presumably these feed a NEON table lookup (one-register for _MN_SHUFFLE,
 * two-register for _MF_SHUFFLE, where +16 would select the second register)
 * -- TODO confirm against the call sites.
 * NOTE(review): fp3 fills the first four bytes here, whereas _MM_SHUFFLE puts
 * fp0 in the lowest bits -- confirm the argument order is intentional.
 *
 * Every argument is evaluated several times; pass constants only.
 */
#define _MN_SHUFFLE(fp3, fp2, fp1, fp0) \
    ( (uint8x16_t){ \
        (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), \
        (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), \
        (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), \
        (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
#define _MF_SHUFFLE(fp3, fp2, fp1, fp0) \
    ( (uint8x16_t){ \
        (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), \
        (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), \
        (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), \
        (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } )
#endif
346+
/*
 * Combine four selectors into a single integer immediate: fp3 is shifted to
 * bit 6, fp2 to bit 4, fp1 to bit 2, and fp0 stays at bit 0. With each
 * argument in the range 0..3 the fields do not overlap and the result fits in
 * eight bits.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) |                     \
     ((fp2) << 4) |                     \
     ((fp1) << 2) |                     \
     (fp0))
341349
@@ -2822,7 +2830,7 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
28222830FORCE_INLINE void _mm_stream_ps (float * p , __m128 a )
28232831{
28242832#if __has_builtin (__builtin_nontemporal_store )
2825- __builtin_nontemporal_store (a , (float32x4_t * ) p );
2833+ __builtin_nontemporal_store (reinterpret_cast < float32x4_t > ( a ) , (float32x4_t * ) p );
28262834#else
28272835 vst1q_f32 (p , vreinterpretq_f32_m128 (a ));
28282836#endif
@@ -5660,7 +5668,7 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
56605668FORCE_INLINE void _mm_stream_pd (double * p , __m128d a )
56615669{
56625670#if __has_builtin (__builtin_nontemporal_store )
5663- __builtin_nontemporal_store (a , (__m128d * ) p );
5671+ __builtin_nontemporal_store (reinterpret_cast < float32x4_t > ( a ) , (float32x4_t * ) p );
56645672#elif defined(__aarch64__ ) || defined(_M_ARM64 )
56655673 vst1q_f64 (p , vreinterpretq_f64_m128d (a ));
56665674#else
@@ -6809,14 +6817,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
68096817 _sse2neon_define2( \
68106818 __m128i, a, b, \
68116819 const uint16_t _mask[8] = \
6812- _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0 , \
6813- ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0 , \
6814- ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0 , \
6815- ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0 , \
6816- ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0 , \
6817- ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0 , \
6818- ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0 , \
6819- ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0 ); \
6820+ _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6821+ ((imm) & (1 << 1)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6822+ ((imm) & (1 << 2)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6823+ ((imm) & (1 << 3)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6824+ ((imm) & (1 << 4)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6825+ ((imm) & (1 << 5)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6826+ ((imm) & (1 << 6)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6827+ ((imm) & (1 << 7)) ? (uint16_t)0xffff : (uint16_t)0x0000 ); \
68206828 uint16x8_t _mask_vec = vld1q_u16(_mask); \
68216829 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
68226830 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
0 commit comments