Mathematical functions optimized using SSE instructions (final part)

xiaoxiao2021-04-11  1.8K+

FLOAT _SSE_COS (FLOAT X) {FLOAT TEMP; __ASM {Movss XMM0, X Movs XMM1, _PS_AM_AM_INV_SIGN_MASK ANDPS XMM0, XMM1 AddSs XMM0, _PS_AM_PI_O_2 MULSS XMM0, _PS_AM_2_O_PI

CVTTS2SI ECX, XMM0 MOVSS XMM5, _PS_AM_1 MOV EDX, ECX SHL EDX, (31 - 1) CVTSI2SS XMM1, ECX and EDX, 0X80000000 and ECX, 0x1

SUBSS XMM0, XMM1 MOVSS XMM6, _SINCOS_MASKS [ECX * 4] Minss XMM0, XMM5

MovsS XMM1, _PS_SINCOS_P3 SUBSS XMM5, XMM0

Andps XMM5, XMM6 MOVSS XMM7, _PS_SINCOS_P2 andNPS XMM6, XMM0 MOV TEMP, EDX ORPS XMM5, XMM6 MOVSS XMM0, XMM5

mulss xmm5, xmm5 movss xmm4, _ps_sincos_p1 movss xmm2, xmm5 mulss xmm5, xmm1 movss xmm1, _ps_sincos_p0 addss xmm5, xmm7 mulss xmm5, xmm2 movss xmm3, temp addss xmm5, xmm4 mulss xmm5, xmm2 orps xmm0, xmm3 addss xmm5, xmm1 mulss xmm0 , XMM5 MOVSS X, XMM0

}

Return X;}

float _SSE2_cos (float x) {__asm ​​{movss xmm0, x movss xmm1, _ps_am_inv_sign_mask movss xmm2, _ps_am_pi_o_2 movss xmm3, _ps_am_2_o_pi andps xmm0, xmm1 addss xmm0, xmm2 mulss xmm0, xmm3

pxor xmm3, xmm3 movd xmm5, _epi32_1 movss xmm4, _ps_am_1 cvttps2dq xmm2, xmm0 pand xmm5, xmm2 movd xmm1, _epi32_2 pcmpeqd xmm5, xmm3 cvtdq2ps xmm6, xmm2 pand xmm2, xmm1 pslld xmm2, (31 - 1)

Subss XMM0, XMM6 MOVSS XMM3, _PS_SINCOS_P3 Minss XMM0, XMM4 SUBSS XMM4, XMM0 andPS XMM0, XMM5 Andnps XMM5, XMM4 ORPS XMM0, XMM5

movaps xmm1, xmm0 movss xmm4, _ps_sincos_p2 mulss xmm0, xmm0 movss xmm5, _ps_sincos_p1 orps xmm1, xmm2 movaps xmm7, xmm0 mulss xmm0, xmm3 movss xmm6, _ps_sincos_p0 addss xmm0, xmm4 mulss xmm0, xmm7 addss xmm0, xmm5 mulss xmm0, xmm7 addss xmm0 , XMM6 MULSS X, XMM0}

Return X;}

FLOAT _SSE_SQRT (Float X) {

Float root = 0.f; _ASM {SQRTSS XMM0, X Movss root, XMM0} Return root;

转载请注明原文地址:https://www.9cbs.com/read-133539.html

New Post(0)