|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <xmmintrin.h> |
|
|
|
|
|
|
|
/* Select the SSE2 integer code paths (v4si) instead of the MMX ones (v2si). */
#define USE_SSE2

/*
 * 16-byte alignment qualifiers.  MSVC wants the qualifier in front of the
 * declaration, every other compiler gets the __attribute__ form after it.
 */
#ifdef _MSC_VER
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* Vector of four floats. */
typedef __m128 v4sf;

#ifdef USE_SSE2
# include <emmintrin.h>
/* Vector of four 32-bit integers (SSE2). */
typedef __m128i v4si;
#else
/* Vector of two 32-bit integers (MMX). */
typedef __m64 v2si;
#endif

/*
 * Helpers that declare a 16-byte aligned, file-local array holding the same
 * value in all four lanes:
 *   _PS_CONST(Name, Val)            -> float _ps_Name[4]
 *   _PI32_CONST(Name, Val)          -> int   _pi32_Name[4]
 *   _PS_CONST_TYPE(Name, Type, Val) -> Type  _ps_Name[4]
 *
 * Each initialiser is cast to the element type explicitly.  Several values
 * are written with a different type than the array element (double literals
 * for float arrays, 0x80000000 -- an unsigned constant -- for an int array);
 * the cast keeps the stored value the same while making the conversion
 * explicit, which avoids conversion warnings and C++ narrowing errors.
 */
#define _PS_CONST(Name, Val)                                              \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END =              \
      { (float)(Val), (float)(Val), (float)(Val), (float)(Val) }
#define _PI32_CONST(Name, Val)                                            \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END =              \
      { (int)(Val), (int)(Val), (int)(Val), (int)(Val) }
#define _PS_CONST_TYPE(Name, Type, Val)                                   \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END =               \
      { (Type)(Val), (Type)(Val), (Type)(Val), (Type)(Val) }

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);

/* Bit pattern of the smallest positive normalised float. */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
/* NOTE: despite the name, 0x7f800000 covers the exponent field. */
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

/* Constants consumed by log_ps. */
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
|
|
|
#if defined (__MINGW32__)

/*
 * MinGW-only replacements for a few SSE intrinsics, implemented with inline
 * assembly and then substituted for the real intrinsics via #define.
 *
 * The helpers are `static inline` so that every translation unit gets its own
 * definition (a plain `inline` definition provides no external definition in
 * C99/C11 and can fail to link when a call is not inlined), and they use the
 * `__asm__` spelling, which is also accepted in strict ISO modes.
 */

/* movhlps: low two lanes of the result = high two lanes of b; high two lanes
 * are kept from a. */
static inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  __asm__ (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
  );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

/* Per-lane a < b, all-ones where true. */
static inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  __asm__ (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
  );
  return a;
}

/*
 * Per-lane a > b, all-ones where true.
 *
 * Implemented as (b < a) with cmpltps.  The previous cmpnleps form computes
 * "not (a <= b)", which is also true when either operand is NaN, whereas
 * _mm_cmpgt_ps yields false for unordered operands.
 */
static inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  __m128 result = b;
  __asm__ (
      "cmpltps %2,%0\n\t"
      : "=x" (result)
      : "0" (result), "x" (a)
  );
  return result;
}

/* Per-lane a == b, all-ones where true. */
static inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  __asm__ (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x" (b)
  );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
|
|
|
#ifndef USE_SSE2
/* Overlay used to move data between one __m128 and a pair of __m64 halves. */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

/*
 * COPY_XMM_TO_MM(xmm_, mm0_, mm1_): store the low 64 bits of xmm_ into mm0_
 * and the high 64 bits into mm1_.
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement (safe
 * in an unbraced if/else); the temporary has a name unlikely to clash with a
 * caller's variable.
 */
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) do {   \
    xmm_mm_union xmm_mm_tmp_;                   \
    xmm_mm_tmp_.xmm = (xmm_);                   \
    (mm0_) = xmm_mm_tmp_.mm[0];                 \
    (mm1_) = xmm_mm_tmp_.mm[1];                 \
  } while (0)

/*
 * COPY_MM_TO_XMM(mm0_, mm1_, xmm_): build xmm_ from mm0_ (low 64 bits) and
 * mm1_ (high 64 bits).
 */
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) do {   \
    xmm_mm_union xmm_mm_tmp_;                   \
    xmm_mm_tmp_.mm[0] = (mm0_);                 \
    xmm_mm_tmp_.mm[1] = (mm1_);                 \
    (xmm_) = xmm_mm_tmp_.xmm;                   \
  } while (0)

#endif
|
|
|
|
|
|
|
|
|
/*
 * log_ps - natural logarithm of four packed single-precision floats.
 *
 * Each input is split into an integer exponent and a mantissa placed in
 * [0.5, 1).  The mantissa term is evaluated with the cephes_log_p0..p8
 * polynomial and the exponent is folded back in through cephes_log_q1/q2.
 *
 * Inputs are first clamped up to the smallest normalised float.  Lanes whose
 * original input was <= 0 are returned with every bit set (a NaN pattern).
 */
v4sf log_ps(v4sf x) {
  const v4sf one  = _mm_load_ps(_ps_1);
  const v4sf half = _mm_load_ps(_ps_0p5);

  /* All-ones in each lane with x <= 0; OR-ed into the result at the end. */
  const v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(const v4sf *)_ps_min_norm_pos);

  /* Extract the biased exponent field and turn it into a float exponent. */
  v4sf e;
#ifdef USE_SSE2
  {
    v4si biased = _mm_srli_epi32(_mm_castps_si128(x), 23);
    biased = _mm_sub_epi32(biased, *(const v4si *)_pi32_0x7f);
    e = _mm_cvtepi32_ps(biased);
  }
#else
  {
    v2si exp_lo, exp_hi;
    COPY_XMM_TO_MM(x, exp_lo, exp_hi);
    exp_lo = _mm_srli_pi32(exp_lo, 23);
    exp_hi = _mm_srli_pi32(exp_hi, 23);
    exp_lo = _mm_sub_pi32(exp_lo, *(const v2si *)_pi32_0x7f);
    exp_hi = _mm_sub_pi32(exp_hi, *(const v2si *)_pi32_0x7f);
    e = _mm_cvtpi32x2_ps(exp_lo, exp_hi);
    _mm_empty();
  }
#endif

  /* Drop the exponent bits of x and replace them with the exponent of 0.5. */
  x = _mm_or_ps(_mm_and_ps(x, *(const v4sf *)_ps_inv_mant_mask), half);
  e = _mm_add_ps(e, one);

  /*
   * Lanes with x < SQRTHF:  e -= 1,  x = (x - 1) + x
   * all other lanes:                 x =  x - 1
   */
  {
    const v4sf below = _mm_cmplt_ps(x, _mm_load_ps(_ps_cephes_SQRTHF));
    const v4sf x_if_below = _mm_and_ps(x, below);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, below));
    x = _mm_add_ps(x, x_if_below);
  }

  const v4sf z = _mm_mul_ps(x, x);

  /* Horner evaluation of the p0..p8 polynomial in x, then times x * z. */
  v4sf y = _mm_load_ps(_ps_cephes_log_p0);
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p1));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p2));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p3));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p4));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p5));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p6));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p7));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_log_p8));
  y = _mm_mul_ps(_mm_mul_ps(y, x), z);

  /* y += e * q1;  y -= z / 2 */
  y = _mm_add_ps(y, _mm_mul_ps(e, _mm_load_ps(_ps_cephes_log_q1)));
  y = _mm_sub_ps(y, _mm_mul_ps(z, half));

  /* result = x + y + e * q2 */
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, _mm_mul_ps(e, _mm_load_ps(_ps_cephes_log_q2)));

  return _mm_or_ps(x, invalid_mask);
}
|
|
|
_PS_CONST(exp_hi, 88.3762626647949f); |
|
_PS_CONST(exp_lo, -88.3762626647949f); |
|
|
|
_PS_CONST(cephes_LOG2EF, 1.44269504088896341); |
|
_PS_CONST(cephes_exp_C1, 0.693359375); |
|
_PS_CONST(cephes_exp_C2, -2.12194440e-4); |
|
|
|
_PS_CONST(cephes_exp_p0, 1.9875691500E-4); |
|
_PS_CONST(cephes_exp_p1, 1.3981999507E-3); |
|
_PS_CONST(cephes_exp_p2, 8.3334519073E-3); |
|
_PS_CONST(cephes_exp_p3, 4.1665795894E-2); |
|
_PS_CONST(cephes_exp_p4, 1.6666665459E-1); |
|
_PS_CONST(cephes_exp_p5, 5.0000001201E-1); |
|
|
|
/*
 * exp_ps - e raised to the power x for four packed floats.
 *
 * The argument is clamped to [exp_lo, exp_hi].  With
 *   n = floor(x * log2(e) + 0.5)   and   r = x - n*C1 - n*C2,
 * the result is  poly(r) * 2^n, where 2^n is assembled directly in the
 * exponent field of a float.
 */
v4sf exp_ps(v4sf x) {
  const v4sf one = _mm_load_ps(_ps_1);

  x = _mm_min_ps(x, _mm_load_ps(_ps_exp_hi));
  x = _mm_max_ps(x, _mm_load_ps(_ps_exp_lo));

  /* fx = x * log2(e) + 0.5 */
  v4sf fx = _mm_mul_ps(x, _mm_load_ps(_ps_cephes_LOG2EF));
  fx = _mm_add_ps(fx, _mm_load_ps(_ps_0p5));

  /* Truncate fx toward zero and convert back to float. */
  v4sf truncated;
#ifdef USE_SSE2
  truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(fx));
#else
  {
    /* Bring the two high lanes down so they can be converted via MMX. */
    const v4sf fx_high = _mm_movehl_ps(_mm_setzero_ps(), fx);
    const v2si int_lo = _mm_cvttps_pi32(fx);
    const v2si int_hi = _mm_cvttps_pi32(fx_high);
    truncated = _mm_cvtpi32x2_ps(int_lo, int_hi);
  }
#endif

  /* Truncation overshoots for negative values: subtract 1 there -> floor. */
  {
    const v4sf overshoot = _mm_and_ps(_mm_cmpgt_ps(truncated, fx), one);
    fx = _mm_sub_ps(truncated, overshoot);
  }

  /* x -= fx * C1;  x -= fx * C2 */
  {
    const v4sf part1 = _mm_mul_ps(fx, _mm_load_ps(_ps_cephes_exp_C1));
    const v4sf part2 = _mm_mul_ps(fx, _mm_load_ps(_ps_cephes_exp_C2));
    x = _mm_sub_ps(x, part1);
    x = _mm_sub_ps(x, part2);
  }

  const v4sf z = _mm_mul_ps(x, x);

  /* Horner evaluation of p0..p5 in x, then  y = y*z + x + 1. */
  v4sf y = _mm_load_ps(_ps_cephes_exp_p0);
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_exp_p1));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_exp_p2));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_exp_p3));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_exp_p4));
  y = _mm_add_ps(_mm_mul_ps(y, x), _mm_load_ps(_ps_cephes_exp_p5));
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* Build 2^n: (n + 0x7f) shifted into the exponent field. */
  v4sf pow2n;
#ifdef USE_SSE2
  {
    v4si n = _mm_cvttps_epi32(fx);
    n = _mm_add_epi32(n, *(const v4si *)_pi32_0x7f);
    n = _mm_slli_epi32(n, 23);
    pow2n = _mm_castsi128_ps(n);
  }
#else
  {
    const v4sf fx_high = _mm_movehl_ps(z, fx);
    v2si n_lo = _mm_cvttps_pi32(fx);
    v2si n_hi = _mm_cvttps_pi32(fx_high);
    n_lo = _mm_add_pi32(n_lo, *(const v2si *)_pi32_0x7f);
    n_hi = _mm_add_pi32(n_hi, *(const v2si *)_pi32_0x7f);
    n_lo = _mm_slli_pi32(n_lo, 23);
    n_hi = _mm_slli_pi32(n_hi, 23);
    COPY_MM_TO_XMM(n_lo, n_hi, pow2n);
    _mm_empty();
  }
#endif

  return _mm_mul_ps(y, pow2n);
}
|
|
|
_PS_CONST(minus_cephes_DP1, -0.78515625); |
|
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); |
|
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); |
|
_PS_CONST(sincof_p0, -1.9515295891E-4); |
|
_PS_CONST(sincof_p1, 8.3321608736E-3); |
|
_PS_CONST(sincof_p2, -1.6666654611E-1); |
|
_PS_CONST(coscof_p0, 2.443315711809948E-005); |
|
_PS_CONST(coscof_p1, -1.388731625493765E-003); |
|
_PS_CONST(coscof_p2, 4.166664568298827E-002); |
|
_PS_CONST(cephes_FOPI, 1.27323954473516); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * sin_ps - sine of four packed floats.
 *
 * Steps:
 *   1. strip the sign of x and remember it;
 *   2. j = (trunc(|x| * 4/pi) + 1) & ~1;
 *   3. bit 2 of j flips the remembered sign, bit 1 of j picks the polynomial;
 *   4. reduce: x = |x| + j*DP1 + j*DP2 + j*DP3 (DPn are negative);
 *   5. evaluate both the coscof and the sincof polynomial, keep one per lane
 *      and apply the sign.
 */
v4sf sin_ps(v4sf x) {
  v4sf sign_bit = _mm_and_ps(x, *(const v4sf *)_ps_sign_mask);
  x = _mm_and_ps(x, *(const v4sf *)_ps_inv_sign_mask);

  /* Scale |x| by 4/pi. */
  v4sf q = _mm_mul_ps(x, _mm_load_ps(_ps_cephes_FOPI));

  v4sf swap_sign_bit, poly_mask;
#ifdef USE_SSE2
  {
    v4si j = _mm_cvttps_epi32(q);
    j = _mm_add_epi32(j, *(const v4si *)_pi32_1);
    j = _mm_and_si128(j, *(const v4si *)_pi32_inv1);
    q = _mm_cvtepi32_ps(j);

    /* Bit 2 of j moved up to the float sign-bit position. */
    const v4si sign_sel = _mm_and_si128(j, *(const v4si *)_pi32_4);
    swap_sign_bit = _mm_castsi128_ps(_mm_slli_epi32(sign_sel, 29));

    /* All-ones in lanes where bit 1 of j is clear. */
    const v4si poly_sel = _mm_and_si128(j, *(const v4si *)_pi32_2);
    poly_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(poly_sel, _mm_setzero_si128()));
  }
#else
  {
    /* Same computation, two lanes at a time through MMX registers. */
    const v4sf q_high = _mm_movehl_ps(_mm_setzero_ps(), q);
    v2si j_lo = _mm_cvttps_pi32(q);
    v2si j_hi = _mm_cvttps_pi32(q_high);

    j_lo = _mm_add_pi32(j_lo, *(const v2si *)_pi32_1);
    j_hi = _mm_add_pi32(j_hi, *(const v2si *)_pi32_1);
    j_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_inv1);
    j_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_inv1);
    q = _mm_cvtpi32x2_ps(j_lo, j_hi);

    v2si sign_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_4);
    v2si sign_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_4);
    sign_lo = _mm_slli_pi32(sign_lo, 29);
    sign_hi = _mm_slli_pi32(sign_hi, 29);

    v2si poly_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_2);
    v2si poly_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_2);
    poly_lo = _mm_cmpeq_pi32(poly_lo, _mm_setzero_si64());
    poly_hi = _mm_cmpeq_pi32(poly_hi, _mm_setzero_si64());

    COPY_MM_TO_XMM(sign_lo, sign_hi, swap_sign_bit);
    COPY_MM_TO_XMM(poly_lo, poly_hi, poly_mask);
    _mm_empty();
  }
#endif
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* x = ((x + q*DP1) + q*DP2) + q*DP3 */
  {
    const v4sf step1 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP1));
    const v4sf step2 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP2));
    const v4sf step3 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP3));
    x = _mm_add_ps(x, step1);
    x = _mm_add_ps(x, step2);
    x = _mm_add_ps(x, step3);
  }

  const v4sf z = _mm_mul_ps(x, x);

  /* coscof branch: ((p0*z + p1)*z + p2)*z*z - z/2 + 1 */
  v4sf y_cos = _mm_load_ps(_ps_coscof_p0);
  y_cos = _mm_add_ps(_mm_mul_ps(y_cos, z), _mm_load_ps(_ps_coscof_p1));
  y_cos = _mm_add_ps(_mm_mul_ps(y_cos, z), _mm_load_ps(_ps_coscof_p2));
  y_cos = _mm_mul_ps(_mm_mul_ps(y_cos, z), z);
  y_cos = _mm_sub_ps(y_cos, _mm_mul_ps(z, _mm_load_ps(_ps_0p5)));
  y_cos = _mm_add_ps(y_cos, _mm_load_ps(_ps_1));

  /* sincof branch: ((p0*z + p1)*z + p2)*z*x + x */
  v4sf y_sin = _mm_load_ps(_ps_sincof_p0);
  y_sin = _mm_add_ps(_mm_mul_ps(y_sin, z), _mm_load_ps(_ps_sincof_p1));
  y_sin = _mm_add_ps(_mm_mul_ps(y_sin, z), _mm_load_ps(_ps_sincof_p2));
  y_sin = _mm_mul_ps(_mm_mul_ps(y_sin, z), x);
  y_sin = _mm_add_ps(y_sin, x);

  /* poly_mask set -> sincof value, clear -> coscof value. */
  const v4sf picked = _mm_add_ps(_mm_andnot_ps(poly_mask, y_cos),
                                 _mm_and_ps(poly_mask, y_sin));

  return _mm_xor_ps(picked, sign_bit);
}
|
|
|
|
|
/*
 * cos_ps - cosine of four packed floats.
 *
 * Follows the same scheme as the sine routine: take |x|, compute
 * j = (trunc(|x| * 4/pi) + 1) & ~1, reduce x with the DP1..DP3 constants and
 * evaluate the coscof and sincof polynomials.  Here the selector is j - 2:
 * its (inverted) bit 2 gives the result sign and its bit 1 picks the
 * polynomial.
 */
v4sf cos_ps(v4sf x) {
  x = _mm_and_ps(x, *(const v4sf *)_ps_inv_sign_mask);

  /* Scale |x| by 4/pi. */
  v4sf q = _mm_mul_ps(x, _mm_load_ps(_ps_cephes_FOPI));

  v4sf sign_bit, poly_mask;
#ifdef USE_SSE2
  {
    v4si j = _mm_cvttps_epi32(q);
    j = _mm_add_epi32(j, *(const v4si *)_pi32_1);
    j = _mm_and_si128(j, *(const v4si *)_pi32_inv1);
    q = _mm_cvtepi32_ps(j);

    j = _mm_sub_epi32(j, *(const v4si *)_pi32_2);

    /* (~j & 4) moved up to the float sign-bit position. */
    const v4si sign_sel = _mm_andnot_si128(j, *(const v4si *)_pi32_4);
    sign_bit = _mm_castsi128_ps(_mm_slli_epi32(sign_sel, 29));

    /* All-ones in lanes where bit 1 of j is clear. */
    const v4si poly_sel = _mm_and_si128(j, *(const v4si *)_pi32_2);
    poly_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(poly_sel, _mm_setzero_si128()));
  }
#else
  {
    /* Same computation, two lanes at a time through MMX registers. */
    const v4sf q_high = _mm_movehl_ps(_mm_setzero_ps(), q);
    v2si j_lo = _mm_cvttps_pi32(q);
    v2si j_hi = _mm_cvttps_pi32(q_high);

    j_lo = _mm_add_pi32(j_lo, *(const v2si *)_pi32_1);
    j_hi = _mm_add_pi32(j_hi, *(const v2si *)_pi32_1);
    j_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_inv1);
    j_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_inv1);
    q = _mm_cvtpi32x2_ps(j_lo, j_hi);

    j_lo = _mm_sub_pi32(j_lo, *(const v2si *)_pi32_2);
    j_hi = _mm_sub_pi32(j_hi, *(const v2si *)_pi32_2);

    v2si sign_lo = _mm_andnot_si64(j_lo, *(const v2si *)_pi32_4);
    v2si sign_hi = _mm_andnot_si64(j_hi, *(const v2si *)_pi32_4);
    sign_lo = _mm_slli_pi32(sign_lo, 29);
    sign_hi = _mm_slli_pi32(sign_hi, 29);

    v2si poly_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_2);
    v2si poly_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_2);
    poly_lo = _mm_cmpeq_pi32(poly_lo, _mm_setzero_si64());
    poly_hi = _mm_cmpeq_pi32(poly_hi, _mm_setzero_si64());

    COPY_MM_TO_XMM(sign_lo, sign_hi, sign_bit);
    COPY_MM_TO_XMM(poly_lo, poly_hi, poly_mask);
    _mm_empty();
  }
#endif

  /* x = ((x + q*DP1) + q*DP2) + q*DP3 */
  {
    const v4sf step1 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP1));
    const v4sf step2 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP2));
    const v4sf step3 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP3));
    x = _mm_add_ps(x, step1);
    x = _mm_add_ps(x, step2);
    x = _mm_add_ps(x, step3);
  }

  const v4sf z = _mm_mul_ps(x, x);

  /* coscof branch: ((p0*z + p1)*z + p2)*z*z - z/2 + 1 */
  v4sf y_cos = _mm_load_ps(_ps_coscof_p0);
  y_cos = _mm_add_ps(_mm_mul_ps(y_cos, z), _mm_load_ps(_ps_coscof_p1));
  y_cos = _mm_add_ps(_mm_mul_ps(y_cos, z), _mm_load_ps(_ps_coscof_p2));
  y_cos = _mm_mul_ps(_mm_mul_ps(y_cos, z), z);
  y_cos = _mm_sub_ps(y_cos, _mm_mul_ps(z, _mm_load_ps(_ps_0p5)));
  y_cos = _mm_add_ps(y_cos, _mm_load_ps(_ps_1));

  /* sincof branch: ((p0*z + p1)*z + p2)*z*x + x */
  v4sf y_sin = _mm_load_ps(_ps_sincof_p0);
  y_sin = _mm_add_ps(_mm_mul_ps(y_sin, z), _mm_load_ps(_ps_sincof_p1));
  y_sin = _mm_add_ps(_mm_mul_ps(y_sin, z), _mm_load_ps(_ps_sincof_p2));
  y_sin = _mm_mul_ps(_mm_mul_ps(y_sin, z), x);
  y_sin = _mm_add_ps(y_sin, x);

  /* poly_mask set -> sincof value, clear -> coscof value. */
  const v4sf picked = _mm_add_ps(_mm_andnot_ps(poly_mask, y_cos),
                                 _mm_and_ps(poly_mask, y_sin));

  return _mm_xor_ps(picked, sign_bit);
}
|
|
|
|
|
|
|
/*
 * sincos_ps - sine and cosine of four packed floats in one pass.
 *
 * x : input values
 * s : receives the sine results   (written first)
 * c : receives the cosine results (written second)
 *
 * The range reduction and both polynomials are computed once; per lane, one
 * polynomial value becomes the sine and the other the cosine, and each gets
 * its own sign:
 *   sine sign   = sign(x) ^ (bit 2 of j)
 *   cosine sign = inverted bit 2 of (j - 2)
 * with j = (trunc(|x| * 4/pi) + 1) & ~1.
 */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf sign_bit_sin = _mm_and_ps(x, *(const v4sf *)_ps_sign_mask);
  x = _mm_and_ps(x, *(const v4sf *)_ps_inv_sign_mask);

  /* Scale |x| by 4/pi. */
  v4sf q = _mm_mul_ps(x, _mm_load_ps(_ps_cephes_FOPI));

  v4sf swap_sign_bit_sin, poly_mask, sign_bit_cos;
#ifdef USE_SSE2
  {
    v4si j = _mm_cvttps_epi32(q);
    j = _mm_add_epi32(j, *(const v4si *)_pi32_1);
    j = _mm_and_si128(j, *(const v4si *)_pi32_inv1);
    q = _mm_cvtepi32_ps(j);

    /* Sine: bit 2 of j moved up to the sign-bit position. */
    const v4si sin_sel = _mm_and_si128(j, *(const v4si *)_pi32_4);
    swap_sign_bit_sin = _mm_castsi128_ps(_mm_slli_epi32(sin_sel, 29));

    /* All-ones in lanes where bit 1 of j is clear. */
    const v4si poly_sel = _mm_and_si128(j, *(const v4si *)_pi32_2);
    poly_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(poly_sel, _mm_setzero_si128()));

    /* Cosine: (~(j - 2) & 4) moved up to the sign-bit position. */
    v4si cos_sel = _mm_sub_epi32(j, *(const v4si *)_pi32_2);
    cos_sel = _mm_andnot_si128(cos_sel, *(const v4si *)_pi32_4);
    sign_bit_cos = _mm_castsi128_ps(_mm_slli_epi32(cos_sel, 29));
  }
#else
  {
    /* Same computation, two lanes at a time through MMX registers. */
    const v4sf q_high = _mm_movehl_ps(_mm_setzero_ps(), q);
    v2si j_lo = _mm_cvttps_pi32(q);
    v2si j_hi = _mm_cvttps_pi32(q_high);

    j_lo = _mm_add_pi32(j_lo, *(const v2si *)_pi32_1);
    j_hi = _mm_add_pi32(j_hi, *(const v2si *)_pi32_1);
    j_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_inv1);
    j_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_inv1);
    q = _mm_cvtpi32x2_ps(j_lo, j_hi);

    v2si sin_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_4);
    v2si sin_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_4);
    sin_lo = _mm_slli_pi32(sin_lo, 29);
    sin_hi = _mm_slli_pi32(sin_hi, 29);
    COPY_MM_TO_XMM(sin_lo, sin_hi, swap_sign_bit_sin);

    v2si poly_lo = _mm_and_si64(j_lo, *(const v2si *)_pi32_2);
    v2si poly_hi = _mm_and_si64(j_hi, *(const v2si *)_pi32_2);
    poly_lo = _mm_cmpeq_pi32(poly_lo, _mm_setzero_si64());
    poly_hi = _mm_cmpeq_pi32(poly_hi, _mm_setzero_si64());
    COPY_MM_TO_XMM(poly_lo, poly_hi, poly_mask);

    v2si cos_lo = _mm_sub_pi32(j_lo, *(const v2si *)_pi32_2);
    v2si cos_hi = _mm_sub_pi32(j_hi, *(const v2si *)_pi32_2);
    cos_lo = _mm_andnot_si64(cos_lo, *(const v2si *)_pi32_4);
    cos_hi = _mm_andnot_si64(cos_hi, *(const v2si *)_pi32_4);
    cos_lo = _mm_slli_pi32(cos_lo, 29);
    cos_hi = _mm_slli_pi32(cos_hi, 29);
    COPY_MM_TO_XMM(cos_lo, cos_hi, sign_bit_cos);
    _mm_empty();
  }
#endif

  /* x = ((x + q*DP1) + q*DP2) + q*DP3 */
  {
    const v4sf step1 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP1));
    const v4sf step2 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP2));
    const v4sf step3 = _mm_mul_ps(q, _mm_load_ps(_ps_minus_cephes_DP3));
    x = _mm_add_ps(x, step1);
    x = _mm_add_ps(x, step2);
    x = _mm_add_ps(x, step3);
  }

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  const v4sf z = _mm_mul_ps(x, x);

  /* coscof branch: ((p0*z + p1)*z + p2)*z*z - z/2 + 1 */
  v4sf poly_c = _mm_load_ps(_ps_coscof_p0);
  poly_c = _mm_add_ps(_mm_mul_ps(poly_c, z), _mm_load_ps(_ps_coscof_p1));
  poly_c = _mm_add_ps(_mm_mul_ps(poly_c, z), _mm_load_ps(_ps_coscof_p2));
  poly_c = _mm_mul_ps(_mm_mul_ps(poly_c, z), z);
  poly_c = _mm_sub_ps(poly_c, _mm_mul_ps(z, _mm_load_ps(_ps_0p5)));
  poly_c = _mm_add_ps(poly_c, _mm_load_ps(_ps_1));

  /* sincof branch: ((p0*z + p1)*z + p2)*z*x + x */
  v4sf poly_s = _mm_load_ps(_ps_sincof_p0);
  poly_s = _mm_add_ps(_mm_mul_ps(poly_s, z), _mm_load_ps(_ps_sincof_p1));
  poly_s = _mm_add_ps(_mm_mul_ps(poly_s, z), _mm_load_ps(_ps_sincof_p2));
  poly_s = _mm_mul_ps(_mm_mul_ps(poly_s, z), x);
  poly_s = _mm_add_ps(poly_s, x);

  /*
   * Per lane: the sine takes poly_s where poly_mask is set and poly_c where
   * it is clear; the cosine takes whatever is left over (obtained by
   * subtracting the selected part from each polynomial value).
   */
  const v4sf sin_from_s = _mm_and_ps(poly_mask, poly_s);
  const v4sf sin_from_c = _mm_andnot_ps(poly_mask, poly_c);
  const v4sf cos_from_s = _mm_sub_ps(poly_s, sin_from_s);
  const v4sf cos_from_c = _mm_sub_ps(poly_c, sin_from_c);

  const v4sf sin_abs = _mm_add_ps(sin_from_c, sin_from_s);
  const v4sf cos_abs = _mm_add_ps(cos_from_c, cos_from_s);

  *s = _mm_xor_ps(sin_abs, sign_bit_sin);
  *c = _mm_xor_ps(cos_abs, sign_bit_cos);
}
|
|
|
|