Fast YUV (4:2:0) to 32-bit RGB conversion: scalar x86 assembly and MMX implementations

zhaozj2021-02-11  221

/// baojinlong@sohu.com

// If you have any problems, contact me.

// The scalar asm function runs at nearly the same speed as the MMX version!

/// ASM CODE

// Saturation lookup table used by the scalar converter.
// `clip` points at the MIDDLE of a 2048-byte buffer, so it may be indexed
// with any value in [-1024, 1023]:
//     clip[i] == 0    for i < 0
//     clip[i] == i    for 0 <= i <= 255
//     clip[i] == 255  for i > 255
// It stays NULL until init_clip() has succeeded.
unsigned char *clip;

// Allocates and fills the `clip` table.
// Calling it again after a successful call is a no-op (the table is never
// freed, so re-allocating would only leak the previous buffer).
// If the allocation fails, `clip` is left NULL and the function returns.
void init_clip()
{
    if (clip != NULL)
        return;

    unsigned char *p = (unsigned char *)malloc(2048);
    if (p == NULL)
        return;

    clip = p + 1024;    // index 0 sits in the centre of the buffer
    for (int i = -1024; i < 1024; i++) {
        clip[i] = (unsigned char)((i >= 0) ? ((i <= 255) ? i : 255) : 0);
    }
}

// r = 1.164 (Y-16) + 1.596 (V-128)
// g = 1.164 (Y-16) - 0.391 (U-128) - 0.813 (V-128)
// b = 1.164 (Y-16) + 2.018 (U-128)

// Conversion coefficients in 6-bit fixed point (value * 64, rounded).
const static int p_1164 = 75;   // 1.164 * 64
const static int p_1596 = 102;  // 1.596 * 64
const static int p_0391 = 25;   // 0.391 * 64
const static int p_0813 = 52;   // 0.813 * 64
const static int p_2018 = 129;  // 2.018 * 64

// Masks that split a dword of four packed bytes into two 16-bit lanes.
const static int ooffooff = 0x00FF00FF;  // keeps bytes 0 and 2
const static int ffooffoo = 0xff00ff00;  // keeps bytes 1 and 3

// Per-channel bias, one copy per 16-bit lane. Each entry is HALF of the
// bias actually applied: the scalar routine adds it after a `shr 1` and
// follows with `shl 1`, so that the value fits in a signed short.
//   2 * 25632 = 51264 == -(223    * 64) (mod 65536)   -> red   offset -223
//   2 *  4349 =  8698 ~=   135.9  * 64                -> green offset +135.9
//   2 * 23906 = 47812 == -(276.93 * 64) (mod 65536)   -> blue  offset -276.93
const static short p_223[] = {25632, 25632};
const static short p_135[] = {4349, 4349};
const static short p_277[] = {23906, 23906};

Void Paroll_yuv2RGB (unsigned char * y, unsigned char * v, unsigned char * r, int h, int w) {// h: Height of y matrix // w: width of y matrix // chroma type :: Must be 420

// r = 1.164 * Y 1.596 * V-223 // g = 1.164 * Y - 0.391 * U - 0.813 * V 135.9 // b = 1.164 * Y 2.018 * u -276.93

INT PY1164_20; INT PY1164_31; INT PV1596; INT PV0813; INT PU0391; INT PU2018; INT PR20, PR31, PG20, PG31, PB20, PB31;

INT RW = W << 2; int RWS16 = RW-16; int LW = W >> 2; int LH = H >> 1; int LW0 = lw; int iplip = (int) CLIP;

__ASM {MOV ESI, YLLW: MOV EDI, V

Add [V], 2

Movzx EBX, Byte PTR [EDI] Movzx Eax, Byte Ptr [EDI 1]

Mov EDI, U Add [U], 2

SHL EAX, 16 or Eax, EBX / / 00 V1 00 V0MOVZX ECX, Byte Ptr [EDI 1]

MOV EBX, EBX MUL [P_0813] SHL ECX, 16 MOV [PV0813], EAX MOV EAX, EBX MUL DWORD PTR P_1596

Movzx EBX, Byte Ptr [EDI]

MOV [PV1596], EAX

MOV EAX, ECX or EAX, EBX / / 00 U1 00 U0

MOV ECX, [ESI] // Y3 Y2 Y1 Y0

MOV EBX, EAX MUL DWORD PTR P_0391

Mov Edi, ECX

MOV [pu0391], EAX

MOV EAX, EBX MUL DWORD PTR P_2018

And ECX, OOFFOOFF // 0 Y2 0 Y0 MOV [PU2018], EAX

MOV EAX, ECX MUL [P_1164] // Y2 Y0 and EDI, FFOOFFOO // Y3 0 Y1 0 MOV [PY1164_20], EAX MOV EAX, EDI SHR EAX, 8 // 0 Y3 0 Y2

Mul [p_1164] // Y3 Y1

MOV ECX, [PV1596]

MOV EBX, [PY1164_20]

MOV EDX, DWORD PTR P_223

MOV [PY1164_31], EAX

Add Eax, ECX Add EBX, ECX SHR EAX, 1 SHR EBX, 1 Add Eax, EDX ADD EBX, EDX SHL EAX, 1 SHL EBX, 1

MOV [PR31], EAX / / R3 R1 MOV [PR20], EBX / / R2 R0

MOV ECX, [PU2018] MOV EAX, [PY1164_20] MOV EBX, [PY1164_31] MOV EDX, DWORD PTR [P_277] Add Eax, ECX Add EBX, ECX SHR EAX, 1 SHR EBX, 1 Add Eax, Edx Add EBX, EDX SHL EAX, 1 SHL EBX, 1 MOV [PB20], EAX MOV [PB31], EBX

MOV EAX, [PY1164_20] MOV EBX, [PY1164_31] MOV ECX, [PU0391] MOV EDX, [PV0813] SHR EAX, 1 SHR EBX, 1 SHR ECX, 1 SHR EBX, 1 SUB EAX, ECX SUB EBX, ECX MOV ECX , DWORD PTR [P_135] SUB EAX, EDX SUB EBX, EDX ADD EAX, ECX ADD EBX, ECX SHL EAX, 1 SHL EBX, 1 MOV [PG20], EBX MOV [PG31], EBX

// Clip and Output Mov EDI, R

Lea Edx, [PR20] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] MOVSX EBX, Word PTR [EDX 2] SAR EAX, 6 SAR EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // R0 MOV DL, [EBX] // R2 MOV [EDI 2], CL MOV [EDI 10], DLLEA EDX, [PR31] MOV ECX, iClip Movsx Eax, Word Ptr [EDX] Movsx EBX, Word PTR [EDX 2] Sar Eax, 6 Sar EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // R1 MOV DL, [EBX] // R3 MOV [EDI 6], CL MOV [EDI 14], DL

Lea Edx, [PG20] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] MOVSX EBX, Word PTR [EDX 2] Sar Eax, 6 Sar EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // G0 MOV DL, [EBX] // G2 MOV [EDI 1], CL MOV [EDI 9], DL

Lea EDX, [PG31] MOV ECX, IClip Movsx EAX, Word PTR [EDX] Movsx EBX, Word PTR [EDX 2] Sar Eax, 6 Sar EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // G1 MOV DL, [EBX] // G3 MOV [EDI 5], CL MOV [EDI 13], DL

Lea Edx, [PB20] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] MOVSX EBX, Word PTR [EDX 2] SAR EAX, 6 SAR EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // B0 MOV DL, [EBX] // B2 MOV [EDI], CL MOV [EDI 8], DL

Lea Edx, [PB31] MOV ECX, IClip Movsx EAX, Word PTR [EDX] Movsx EBX, Word PTR [EDX 2] SAR EAX, 6 SAR EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // B1 MOV DL, [EBX] // B3

MOV EBX, RW Add ESI, W Add [R], EBX MOV EAX, [ESI] // Y3 Y2 Y1 Y0

MOV [EDI 4], CL MOV [EDI 12], DL

// next row of y MoV EBX, EAX and Eax, OoffoOff // 0 y2 0 y0 MUL [P_1164] and EBX, FFOOFFOO // Y3 0 Y1 0 SHR EBX, 8 MOV [PY1164_20], EAX MOV EAX, EBXMUL [P_1164 ]

MOV ECX, PV1596

MOV EBX, PY1164_20 MOV EDX, DWORD PTR P_223

MOV [PY1164_31], EAX

Add EAX, ECX Add EBX, ECX SHR EAX, 1 SHR EBX, 1 Add Eax, EDX ADD EBX, EDX SHL EAX, 1 SHL EBX, 1 MOV [PR31], EAX // R3 R1 MOV [PR20], EBX // R2 R0

MOV ECX, [PU2018] MOV EAX, [PY1164_20] MOV EBX, [PY1164_31] MOV EDX, DWORD PTR [P_277] Add Eax, ECX Add EBX, ECX SHR EAX, 1 SHR EBX, 1 Add Eax, Edx Add EBX, EDX SHL EAX, 1 SHL EBX, 1 MOV [PB20], EAX MOV [PB31], EBX

Mov ECX, [PU0391] MOV EAX, [PY1164_20] MOV EBX, [PY1164_31] MOV EDX, [PV0813] SHR ECX, 1 Shr EAX, 1 SHR EBX, 1 SHR EDX, 1 SUB EAX, ECX SUB EBX, ECX MOV ECX , DWORD PTR [P_135] SUB EAX, EDX SUB EBX, EDX ADD EAX, ECX ADD EBX, ECX SHL EAX, 1 SHL EBX, 1 MOV [PG20], EBX MOV [PG31], EBX

// Clip and Output Mov EDI, R

Lea Edx, [PR20] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] MOVSX EBX, Word PTR [EDX 2] SAR EAX, 6 SAR EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // R0 MOV DL, [EBX] // R2 MOV [EDI 2], CL MOV [EDI 10], DL

Lea Edx, [Pr31] MOV ECX, IClip Movsx Eax, Word PTR [EDX] Movsx EBX, Word PTR [EDX 2] SAR EAX, 6 SAR EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // R1 MOV DL, [EBX] // R3 MOV [EDI 6], CL MOV [EDI 14], DL

Lea Edx, [PG20] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] Movsx EBX, Word PTR [EDX 2] Sar Eax, 6 Sar EBX, 6 Add Eax, ECX Add EBX, ECXR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // G0 MOV DL, [EBX] // G2 MOV [EDI 1], CL MOV [EDI 9], DL

Lea EDX, [PG31] MOV ECX, IClip Movsx EAX, Word PTR [EDX] Movsx EBX, Word PTR [EDX 2] Sar Eax, 6 Sar EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // G1 MOV DL, [EBX] // G3 MOV [EDI 5], CL MOV [EDI 13], DL

Lea Edx, [PB20] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] MOVSX EBX, Word PTR [EDX 2] SAR EAX, 6 SAR EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // B0 MOV DL, [EBX] // B2 MOV [EDI], CL MOV [EDI 8], DL

Lea Edx, [PB31] MOV ECX, ICLIP MOVSX EAX, Word PTR [EDX] File: // B1 Movsx EBX, Word PTR [EDX 2] File: // B3 Sar Eax, 6 Sar EBX, 6 Add Eax, ECX Add EBX, ECX

XOR ECX, ECX XOR EDX, EDX MOV CL, [EAX] // B1 MOV DL, [EBX] // B3 MOV [EDI 4], CL MOV [EDI 12], DL

Mov Eax, RWS16 SUB ESI, W Add ESI, 4 SUB [R], EAX

SUB [LW], 1 JNZ LLW

MOV EAX, LW0 MOV EBX, RW

Add ESI, W Add [R], EBX

MOV [LW], EAX

SUB [LH], 1 JNZ LLW}

}

// asm code end

// MMX code begin

#ifdef __yuv2rgb_mul32

Const Static Short T16 [4] = {16, 16, 16, 16}; Const Static Short T128 [4] = {128, 128, 128, 128}; Const Short T1164 [4] = {4768, 4768, 4768, 4768}; Const Short T1596 [4] = {6538, 6538, 6538, 6538}; const short t0391 [4] = {1602, 1602, 1602, 1602}; const short t0813 [4] = {3330, 3330, 3330, 3330}; Const Short T2018 [4] = {8266, 8266, 8266, 8266}; // r = 1.164 (Y-16) 1.596 (V-128) // g = 1.164 (Y-16) -0.391 (U-128) - 0.813 (V-128) // b = 1.164 (Y-16) 2.018 (U-128)

#define ___0RGB

Void VideoPlayer :: YUV2RGB4XMMXC420 (unsigned char * lpy, unsigned char * lpu, unsigned char * lpv, unsigned char * lprgb, int nsrcheight, int nsrcwidth)

{INT RGBWIDTH = NSRCWIDTH << 2; // 32 BITS 0RGB; INT NYW = nsrcwidth; int col = nsrcwidth >> 3; int rot = nsrcheight >> 1;

int t1596v_128_10 [2]; int t1596v_128_32 [2]; int t0813v_128_10 [2]; int t0813v_128_32 [2]; int t0391u_128_10 [2]; int t0391u_128_32 [2]; int t2018u_128_10 [2]; int t2018u_128_32 [2];

__ASM {MOV ESI, LPU MOV EDI, LPV EAX, LPY MOV EDX, LPRGB MOV ECX, Col Mov EBX, ROW

RRR: PXOR MM0, MM0

MOVQ MM3, QWORD PTR T128 MOVD MM2, DWORD PTR [EDI] File: // 00 00 00 V3 V2 V1 V0 MOVD MM1, DWORD PTR [ESI] file: // 00 00 00 U3 U2 U0 PUNPCKLBW MM2, MM0 File: // 00 v3 00 V2 00 V1 00 V0 PUNPCKLBW MM1, MM0 File: // 00 U3 00 U2 00 U1 00 U0 PSUBSW MM1, MM3 File: // U-128 PSUBSW MM2, MM3 File: // V-128File : // compute u, v DataFile: // t0391u_128 MOVQ MM7, QWORD PTR T0391 MOVQ MM3, MM1 MOVQ MM4, MM1 Pmullw MM4, MM7 PMULHW MM3, MM7 MOVQ MM7, MM4 PUNPCKHWD MM4, MM3 File: // T0391U_128_32--> mm4 punpcklwd mm7, mm3 file: // t0391u_128_10--> mm7 movq qword ptr t0391u_128_32, mm4 movq qword ptr t0391u_128_10, mm7file: // t2018u_128 movq mm7, qword ptr t2018 movq mm3, mm1 pmullw mm1, mm7 pmulhw mm3, mm7 movq mm7 , MM1 PUNPCKHWD MM1, MM3 file: // t2018u_128_32--> mm1 punpcklwd mm7, mm3 file: // t2018u_128_10--> mm7 movq qword ptr t2018u_128_32, mm1 movq qword ptr t2018u_128_10, mm7file: // t1596v_128 movq mm7, qword ptr t1596 movq mm3, mm2 movq mm4 , MM2 Pmullw MM4, MM7 PMULHW MM3, MM7 MOVQ MM7, MM4 PUNPCKHWD MM4, MM3 File: // T1596V_128_32--> MM4 PUNPCKLWD MM7, MM3 File: // T1596V_128_10-->

mm7 movq qword ptr t1596v_128_32, mm4 movq qword ptr t1596v_128_10, mm7file: // t0813v_128 movq mm7, qword ptr t0813 movq mm3, mm2 pmullw mm2, mm7 pmulhw mm3, mm7 movq mm7, mm2 punpckhwd mm2, mm3 file: // t0813v_128_32-- > mm2 punpcklwd mm7, mm3 file: // t0813v_128_10--> mm7 movq qword ptr t0813v_128_32, mm2 movq qword ptr t0813v_128_10, mm7movq mm3, dword ptr [eax] // 76 54 32 10 pxor mm0, mm0 movq mm2, mm3 punpcklbw mm2 , MM0 // 03 02 01 00 PunpckHBW MM3, MM0 // 07 06 05 04 MOVQ MM4, MM2 MOVQ MM5, MM3 PUNPCKLWD MM2, MM0 // 00 01 00 00 00 00 PUNPCKHWD MM0, MM4 // 03 00 02 00 PXOR MM4, MM4 POR MM0, MM2 // 03 01 02 00 ---> MM0 MOVQ MM7, QWORD PTR T16 PUNPCKLWD MM3, MM4 // 00 05 00 04 PUNPCKHWD MM4, MM5 // 07 00 06 00 POR MM4, MM3 // 07 05 06 04 -> MM5 PSUBSW MM0, MM7 File: // Y-16 MOVQ MM5, MM4 PSUBSW MM5, MM7 File: // Y-16File: // compute MOVQ MM7, QWORD PTR T1164 MOVQ MM6, MM0 File: // Y3 Y1 Y2 Y0 PMULLW MM6, MM7 PMULHW MM0, MM7 MOVQ MM7, MM6 PUNPCKHWD MM7, MM0 // Y3 Y1 File: //1.164 (Y-16) - > MM7 PUNPCKLWD MM6, MM0 // Y2 Y0 File: //1.164 (Y-16) -> MM6

MOVQ MM0, QWORD PTR T1596V_128_10 MOVQ MM1, MM6 // Y2 Y0 MOVQ MM2, MM7 // Y3 Y1 // R = 1.164 (Y-16) 1.596 (V-128) PADDD MM1, MM0 // R2 R0 PADDD MM2, MM0 // R3 R1 PSRAD MM1, 12 PSRAD MM2, 12 MOVQ MM0, MM1 PUNPCKHDQ MM1, MM2 // R3 R2 PUNPCKLDQ MM0, MM2 // R1 R0 PACKSSDW MM0, MM1 // R3 R2 R1 R0 ---> MM0MOVQ MM1, Qword PTR T0391U_128_10 MOVQ MM4, QWORD PTR T0813V_128_10 // g = 1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) MOVQ MM2, MM6 MOVQ MM3, MM7 PSUBD MM2, MM1 PSUBD MM3, MM1 PSubd MM2, MM4 PSUBD MM3, MM4 PSRAD MM2, 12 PSRAD MM3, 12 MOVQ MM4, MM2 MOVQ MM1, Qword PTR T2018U_128_10 PUNPCKHDQ MM2, MM3 PUNPCKLDQ MM4, MM3 PacksSDW MM4, MM2 // G3 G2 G1 G0 ---> mm4

// b = 1.164 (Y-16) 2.018 (U-128) PADDD MM6, MM1 / / B2 B0 PADDD MM7, MM1 / / B3 B1 PSRAD MM6, 12 PSRAD MM7, 12 MOVQ MM1, MM6 PUNPCKHDQ MM1, MM7 PUNPCKLDQ MM6, MM7 PXOR MM2, MM2 PackssDW MM6, MM1 / / B3 B2 B1 B0 ---> MM6

// b -> mm6, g -> mm4, r -> mm0

#ifdef ___0RGB PackusWB MM6, MM2 PackusWB MM4, MM2 PackusWB MM0, MM2 PUNPCKLBW MM6, MM4 // G3 B3 G2 B2 G1 B1 G0 B0 -> MM6 PUNPCKLBW MM0, MM2 // 00 R3 00 R2 00 r1 00 R0 -> MM0 MOVQ MM7, MM6 PUNPCKLWD MM6, MM0 // 00 R1 G1 B1 00 r0 G0 B0 PUNPCKHWD MM7, MM0 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX], MM6 MOVQ QWORD PTR [EDX 8], MM7 # Else Packuswb mm0, mm2 file: // r packuswb mm4, mm2 file: // g Packuswb mm6, mm2 file: // b

PUNPCKLBW MM0, MM4 / / G3 R3 ​​G2 R2 G1 R1 G0 R0 -> MM0 PUNPCKLBW MM6, MM2 // 00 B3 00 B2 00 B1 00 B0 -> MM6 MOVQ MM7, MM0PUNPCKLWD MM0, MM6 // 00 B1 G1 R1 00 R0 G0 B0 PUNPCKHWD MM7, MM6 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ Qword PTR [EDX], MM0 MOVQ Qword PTR [EDX 8], MM7 # endiffile: // compute MOVQ MM7, QWORD PTR T1164 MOVQ MM6, MM5 File: // Y7 Y5 Y5 PMULLW MM6, MM7 PMULHW MM5, MM7 MOVQ MM7, MM6 PUNPCKHWD MM7, MM5 // Y7 Y5 File: //1.164 (Y-16) -> MM7 PUNPCKLWD MM6, MM5 // Y6 Y4 file: //1.164 (y-16) -> mm6

MOVQ MM0, QWORD PTR T1596V_128_32 MOVQ MM1, MM6 // Y6 Y4 MOVQ MM2, MM7 // Y7 Y5 // R = 1.164 (Y-16) 1.596 (V-128) PADDD MM1, MM0 // R2 R0 PADDD MM2, MM0 // R3 R1 PSRAD MM2, 12 PSRAD MM1, 12 MOVQ MM0, MM1 PUNPCKHDQ MM1, MM2 // R3 R2 PUNPCKLDQ MM0, MM2 // R1 R0 PACKSSDW MM0, MM1 // R3 R2 R1 R0 ---> mm0

MOVQ MM1, QWORD PTR T0391U_128_32 MOVQ MM4, QWORD PTR T0813V_128_32File: //g=1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) MOVQ MM2, MM6 MOVQ MM3, MM7 PSUBD MM2, MM1 PSUBD MM3, MM1 PSUBD MM2, MM4 PSUBD MM3, MM4 PSRAD MM2, 12 PSRAD MM3, 12 MOVQ MM1, Qword PTR T2018U_128_32 MOVQ MM4, MM2 PUNPCKLDQ MM4, MM3 PacksSDW MM4, MM2 // G3 G2 G1 G0 --- > mm4

// b = 1.164 (Y-16) 2.018 (U-128) PADDD MM6, MM1 / / B2 B0 PADDD MM7, MM1 / / B3 B1 PSRAD MM6, 12 PSRAD MM7, 12 MOVQ MM1, MM6 PUNPCKHDQ MM1, MM7 PUNPCKLDQ MM6, MM7 PXOR MM2, MM2 PackssDW MM6, MM1 / / B3 B2 B1 B0 ---> MM6

// b -> mm6, g -> mm4, r -> mm0 # ifdef ___0RGB packuswb mm6, mm2 packuswb mm4, mm2 punpcklbw mm6, mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -> mm6 packuswb mm0 , MM2 PUNPCKLBW MM0, MM2 // 00 R3 00 r2 00 r1 00 r0 -> mm0 MOVQ MM7, MM6 PUNPCKLWD MM6, MM0 // 00 R1 G1 B1 00 r0 G0 B0 PUNPCKHWD MM7, MM0 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX 16], MM6 MOVQ Qword PTR [EDX 24], MM7 # Else Packuswb MM0, MM2 File: // R Packuswb MM4, MM2 File: // g Packuswb MM6, MM2 File: // BPUNPCKLBW MM0, MM4 / / G3 R3 ​​G2 R2 G1 R1 G0 R0 -> MM0 PUNPCKLBW MM6, MM2 // 00 B3 00 B2 00 B1 00 B0 -> MM6 MOVQ MM7, MM0

PUNPCKLWD MM0, MM6 / / 00 B1 G1 R1 00 R0 G0 B0 PUNPCKHWD MM7, MM6 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX 16], MM0 MOVQ Qword PTR [EDX 24], MM7 # ENDIF / file: // Second Stage, Next Row of Y Add Eax, NYW Add Edx, RgbWidth

MOVQ MM3, DWORD PTR [EAX] // 76 54 32 10 PXOR MM0, MM0 MOVQ MM2, MM3 PUNPCKLBW MM2, MM0 // 03 02 01 00 PunpckHBW MM3, MM0 // 07 06 05 04 MOVQ MM4, MM2 PUNPCKLWD MM2, MM0 // 00 01 00 00 00 02 00 PXOR MM4, MM4 POR MM0, MM2 // 03 01 02 00 ---> MM0 MOVQ MM7, QWORD PTR T16 MOVQ MM5, MM3 PUNPCKLWD MM3, MM4 / / 00 05 00 04 PUNPCKHWD MM4, MM5 // 07 00 06 00 POR MM4, MM3 // 07 05 06 04 -> MM4 PSUBSW MM0, MM7 File: // Y-16 MOVQ MM5, MM4 PSUBSW MM5, MM7 FILE: // Y-16 File: // compute MOVQ MM7, QWORD PTR T1164 MOVQ MM6, MM0 File: // Y3 Y1 Y2 Y0 Pmullw MM6, MM7 PMULHW MM0, MM7 MOVQ MM7, MM6 PUNPCKHWD MM7, MM0 // Y3 Y1 File: //1.164 (Y-16) -> MM7 PUNPCKLWD MM6, MM0 // Y2 Y0 File: //1.164 (Y-16) -> MM6MOVQ MM0, QWORD PTR T1596V_128_10 MOVQ MM1, MM6 // Y2 Y0 MOVQ MM2, MM7 // Y3 Y1 // r = 1.164 (Y-16) 1.596 (V-128) PADDD MM1, MM0 // R2 R0 PADDD MM2, MM0 // R3 R1 PSRAD MM2, 12 PSRAD MM1, 12 MOVQ MM0, MM1 PUNPCKHDQ MM1, MM2 // R3 R2 PUNPCKLDQ MM0, MM2 // R1 R0 PACKSSDW MM0, MM1 // R3 R2 R1 R0 ---> MM0

MOVQ MM1, QWORD PTR T0391U_128_10 MOVQ MM4, QWORD PTR T0813V_128_10File: //g=1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) MOVQ MM2, MM6 MOVQ MM3, MM7 PSUBD MM2, MM1 PSUBD MM3, MM1 PSUBD MM2, MM4 PSUBD MM3, MM4 PSRAD MM2, 12 PSRAD MM3, 12 MOVQ MM4, MM2 MOVQ MM1, QWORD PTR T2018U_128_10 PUNPCKHDQ MM2, MM3 PUNPCKLDQ MM4, MM3 PacksSSDW MM4, MM2 // G3 G2 G1 G0 --- > mm4 // b = 1.164 (Y-16) 2.018 (U-128) PADDD MM6, MM1 / / B2 B0 Paddd MM7, MM1 / / B3 B1 PSRAD MM6, 12 PSRAD MM7, 12 MOVQ MM1, MM6 PUNPCKHDQ MM1, MM7 PUNPCKLDQ MM6, MM7 PXOR MM2, MM2 Packssdw MM6, MM1 / / B3 B2 B1 B0 ---> MM6

// b -> mm6, g -> mm4, r -> mm0 # ifdef ___0RGB packuswb mm6, mm2 packuswb mm4, mm2 punpcklbw mm6, mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -> mm6 packuswb mm0 , MM2 PUNPCKLBW MM0, MM2 // 00 R3 00 r2 00 r1 00 r0 -> mm0 MOVQ MM7, MM6 PUNPCKLWD MM6, MM0 // 00 R1 G1 B1 00 r0 G0 B0 PUNPCKHWD MM7, MM0 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX], MM6 MOVQ QWORD PTR [EDX 8], MM7 # Else Packuswb MM0, MM2 File: // R PackusWB MM4, MM2 File: //g PackusWB MM6, MM2 File: // B

PUNPCKLBW MM0, MM4 / / G3 R3 ​​G2 R2 G1 R1 G0 R0 -> MM0 PUNPCKLBW MM6, MM2 // 00 B3 00 B2 00 B1 00 B0 -> MM6 MOVQ MM7, MM0

PUNPCKLWD MM0, MM6 / / 00 B1 G1 R1 00 R0 G0 B0 PUNPCKHWD MM7, MM6 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX], MM0 MOVQ Qword PTR [EDX 8], MM7 # endiff: / / Compute MOVQ MM7, QWORD PTR T1164 MOVQ MM6, MM5 File: // Y7 Y5 Y6 Y4 PMULLW MM6, MM7 PMULHW MM5, MM7 MOVQ MM7, MM6 PUNPCKHWD MM7, MM5 // Y7 Y5 File: //1.164 (Y-16) -> MM7 PUNPCKLWD MM6, MM5 // Y6 Y4 File: //1.164 (Y-16) -> MM6MOVQ MM0, QWORD PTR T1596V_128_32 MOVQ MM1, MM6 // Y6 Y4 MOVQ MM2, MM7 // Y7 Y5 // R = 1.164 (Y-16) 1.596 (V-128) PADDD MM1, MM0 // R2 R0 PADDD MM2, MM0 // R3 R1 PSRAD MM1, 12 PSRAD MM2, 12 MOVQ MM0, MM1 PUNPCKHDQ MM1, MM2 // R3 R2 PUNPCKLDQ MM0, MM2 // R1 R0 PackssDW MM0, MM1 // R3 R2 R1 R0 ---> MM0

MOVQ MM1, QWORD PTR T0391U_128_32 MOVQ MM4, QWORD PTR T0813V_128_32File: //g=1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) MOVQ MM2, MM6 MOVQ MM3, MM7 PSUBD MM2, MM1 PSUBD MM3, MM1 PSUBD MM2, MM4 PSUBD MM3, MM4 PSRAD MM2, 12 PSRAD MM3, 12 MOVQ MM1, Qword PTR T2018U_128_32 MOVQ MM4, MM2 PUNPCKLDQ MM4, MM3 PacksSDW MM4, MM2 // G3 G2 G1 G0 --- > mm4

// b = 1.164 (Y-16) 2.018 (U-128) PADDD MM6, MM1 / / B2 B0 PADDD MM7, MM1 / / B3 B1 PSRAD MM6, 12 PSRAD MM7, 12 MOVQ MM1, MM6 PUNPCKHDQ MM1, MM7 PUNPCKLDQ MM6, MM7 PXOR MM2, MM2 PackssDW MM6, MM1 / / B3 B2 B1 B0 ---> MM6

// b -> mm6, g -> mm4, r -> mm0 # ifdef ___0RGB packuswb mm6, mm2 packuswb mm4, mm2 punpcklbw mm6, mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -> mm6 packuswb mm0 , MM2 PUNPCKLBW MM0, MM2 // 00 R3 00 r2 00 r1 00 r0 -> mm0 MOVQ MM7, MM6 PUNPCKLWD MM6, MM0 // 00 R1 G1 B1 00 r0 G0 B0 PUNPCKHWD MM7, MM0 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX 16], MM6 MOVQ Qword PTR [EDX 24], MM7 # Else Packuswb MM0, MM2 File: // R Packuswb MM4, MM2 File: // g Packuswb MM6, MM2 File: // BPUNPCKLBW MM0, MM4 / / G3 R3 ​​G2 R2 G1 R1 G0 R0 -> MM0 PUNPCKLBW MM6, MM2 // 00 B3 00 B2 00 B1 00 B0 -> MM6 MOVQ MM7, MM0

PUNPCKLWD MM0, MM6 / / 00 B1 G1 R1 00 R0 G0 B0 PUNPCKHWD MM7, MM6 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ QWORD PTR [EDX 16], MM0 MOVQ Qword PTR [EDX 24], MM7 # ENDIF Sub Eax, NYW SUB EDX, RGBWIDTH ADD ESI, 4 Add EDI, 4 Add Eax, 8 Add Edx, 32 DEC ECX JNZ RRR

MOV ECX, COL Add Eax, NYW Add Edx, Rgbwidth Dec EBX JNZ RRR EMMS}}

#ELSE

Short T1164 [4] = {19071, 19071, 19071, 19071 // << 2}; Short T1596 [4] = {26149, 26149, 26149, 26149 // << 2}; short t0391 [4] = {25625 , 25625, 25625, 25625 // << 0}; short t0813 [4] = {26641, 26641, 26641, 26641 // << 1}; Short T2018 [4] = {16532, 16532, 16532, 16532 //// << 3}; Short T16 [4] = {16, 16, 16, 16}; SHORT T128 [4] = {128, 128, 128, 128}

void VideoPlayer :: yuv2rgb4XmmxC420 (unsigned char * lpY, unsigned char * lpU, unsigned char * lpV, unsigned char * lpRGB, int nSrcHeight, int nSrcWidth) {int rgbwidth = nSrcWidth << 2; // 32 bits rgb0; int nyw = NsrcWidth; int col = nsrcwidth >> 3; int = nsrcheight >> 1; #define mmt2018u mm1 #define mmt0813v mm2 #define mmt0391u mm3 #define mmt1596v mm4

__INT64 TY;

__ASM {MOV ESI, LPU MOV EDI, LPV EAX, LPY MOV EDX, LPRGB MOV ECX, Col Mov EBX, ROW

RRR: PXOR MM0, MM0

MOVQ MM3, QWORD PTR T128 MOVQ MM4, QWORD PTR T0391 MOVQ MM5, QWORD PTR T2018 MOVQ MM6, Qword PTR T1596 MOVQ MM7, Qword PTR T0813

MOVD MM1, DWORD PTR [ESI] MOVD MM2, DWORD PTR [EDI] PUNPCKLBW MM1, MM0 PUNPCKLBW MM2, MM0

File: // Copute U, V PSUBSW MM1, MM3 File: // U-128 PSUBSW MM2, MM3 File: // V-128 MOVQ MM3, MM1 PSLLW MM1, 3 Pmulhw MM3, MM4 // T0391U -> MM3 Pmulhw MM1, MM5 / / T2018U -> MM1 MOVQ MM4, MM2 PSLLW MM2, 1 PSLLW MM4, 2 Pmulhw MM2, MM7 // T0813V -> MM2 PMULHW MM4, MM6 // T1596V -> MM4

MOVQ MM5, DWORD PTR [EAX] // 76 54 32 10 PXOR MM0, MM0 MOVQ MM6, MM5 PUNPCKLBW MM5, MM0 // 03 02 01 00 PunpckHBW MM0, MM6 // 70 60 50 40 POR MM0, MM5 // 73 62 51 40 PXOR MM6, MM6 PXOR MM5, MM5 PUNPCKHBW MM6, MM0 // 70 30 60 20 PUNPCKLBW MM0, MM5 // 05 01 04 00 POR MM0, MM6 // 75 31 64 20 PXOR MM5, MM5 MOVQ MM6, MM0 PUNPCKHBW MM6 , MM5 // Y7 Y5 Y3 Y1 PUNPCKLBW MM0, MM5 MOVQ MM5, QWORD PTR T16 MOVQ MM7, QWORD PTR T1164 PSUBSW MM6, MM5 PSUBSW MM0, MM5 PSLLW MM6, 2 PSLLW MM0, 2 Pmulhw MM6, MM7 PMulHW MM0, MM7 // Y6 Y2 Y2 Y0 -> MM0 MOVQ QWORD PTR TY, MM6 // Y7 Y5 Y3 Y1 -> TY File: // compute PXOR MM7, MM7 MOVQ MM5, MMT1596V MOVQ MM6, MM0 F ILE: // Copy 1.164 (Y-16) Paddsw MM5, MM0 // r = 1.164 (Y-16) 1.596 (V-128) R -> MM5 PSUBSW MM6, MMT0391U File: //1.164 (Y-16 ) -0.391 (U-128) PSUBSW MM6, MMT0813V // g = 1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) g -> mm6 paddsw mm0, mmt2018 u // b = 1.164 (Y-16) 2.018 (U-128) B -> MM0

// b -> mm0, g -> mm6, r -> mm5 packuswb mm6, mm7 packuswb mm0, mm7 PUNPCKLBW MM0, MM6 / / G3 B3 G2 B2 G1 B1 G0 B0 -> MM0 PackusWB MM5, MM7 PUNPCKLBW MM5, MM7 // 00 r3 00 r2 00 r1 00 r0 -> mm5 MOVQ MM7, MM0 PUNPCKLWD MM0, MM5 // 00 R1 G1 B1 00 r0 G0 B0 PUNPCKHWD MM7, MM5 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ Qword PTR [EDX], MM0 MOVQ MM0, QWORD PTR TY MOVQ QWORD PTR [EDX 8], MM7File: // Compute PXOR MM7, MM7 MOVQ MM5, MMT1596V Paddsw MM5, MM0 // R = 1.164 (Y-16) 1.596 (V-128) R -> MM5 MOVQ MM6, MM0 File: // Copy 1.164 (Y-16) PSUBSW MM6, MMT0391U File: //1.164 (Y-16) -0.391 (U-128) PSUBSW MM6, MMT0813V / / G = 1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) g -> mm6 paddsw mm0, MMT2018U // b = 1.164 (Y-16) 2.018 (U-128 ) B -> mm0

// b -> mm0, g -> mm6, r -> mm5 packuswb mm6, mm7 packuswb mm0, mm7 PUNPCKLBW MM0, MM6 / / G3 B3 G2 B2 G1 B1 G0 B0 -> MM0 PackusWB MM5, MM7 PUNPCKLBW MM5, MM7 / / 00 r3 00 r2 00 r1 00 r0 -> mm5 MOVQ MM7, MM0 MOVQ MM6, [EDX] // 2 0

PUNPCKLWD MM0, MM5 / / 00 R1 G1 B1 00 R0 G0 B0 PUNPCKHWD MM7, MM5 // 00 R3 G3 B3 00 R2 G2 B2

MOVQ MM5, MM6 PUNPCKLDQ MM6, MM0 // 1 0 PUNPCKHDQ MM5, MM0 // 3 2 MOVQ MM0, [EDX 8] // 4 6 MOVQ [EDX], MM6 MOVQ [EDX 8], MM5 MOVQ MM6, MM0 PUNPCKHDQ MM0, MM7 / / 7 6 PUNPCKLDQ MM6, MM7 / / 5 4 MOVQ [EDX 24], MM0 MOVQ [EDX 16], MM6

File: // Next Row of Y Add Eax, NYW Add Edx, Rgbwidth

MOVQ MM5, DWORD PTR [EAX] // 76 54 32 10 PXOR MM0, MM0 MOVQ MM6, MM5 PUNPCKLBW MM5, MM0 // 03 02 01 00 PunpckHBW MM0, MM6 // 70 60 50 40 POR MM0, MM5 // 73 62 51 40 PXOR MM6, MM6 PXOR MM5, MM5 PUNPCKHBW MM6, MM0 // 70 30 60 20 PUNPCKLBW MM0, MM5 // 05 01 04 00 POR MM0, MM6 // 75 31 64 20 PXOR MM5, MM5 MOVQ MM6, MM0 PUNPCKHBW MM6 , MM5 // Y7 Y5 Y3 Y1 PUNPCKLBW MM0, MM5 MOVQ MM5, QWORD PTR T16 MOVQ MM7, QWORD PTR T1164 PSUBSW MM6, MM5 PSUBSW MM0, MM5 PSLLW MM6, 2 PSLLW MM0, 2 Pmulhw MM6, MM7 PMulHW MM0, MM7 // Y6 Y4 Y2 Y0 -> MM0 MOVQ QWORD PTR TY, MM6 // Y7 Y5 Y3 Y1 -> TY File: // compute PXOR MM7, MM7 MOVQ MM5, MMT1596V Paddsw MM5, MM0 // R = 1.164 (Y-16 1.596 (V-128) R -> MM5 MOVQ MM6, MM0 File: // Copy 1.164 (Y-16) PSUBSW MM6, MMT0391U File: //1.164 (Y-16) -0.391 (U-128) PSUBSW MM6, MMT0813V / / G = 1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) g -> mm6 paddsw mm0, MMT2018U // b = 1.164 (Y-16) 2.018 (U -128) b -> mm0

// b -> mm0, g -> mm6, r -> mm5 packuswb mm6, mm7 packuswb mm0, mm7 PUNPCKLBW MM0, MM6 / / G3 B3 G2 B2 G1 B1 G0 B0 -> MM0 PackusWB MM5, MM7 PUNPCKLBW MM5, MM7 // 00 r3 00 r2 00 r1 00 r0 -> mm5 MOVQ MM7, MM0 PUNPCKLWD MM0, MM5 // 00 R1 G1 B1 00 r0 G0 B0 PUNPCKHWD MM7, MM5 // 00 R3 G3 B3 00 R2 G2 B2 MOVQ Qword PTR [EDX], MM0 MOVQ QWORD PTR [EDX 8], MM7File: // compute MOVQ MM0, QWORD PTR TY PXOR MM7, MM7 MOVQ MM5, MMT1596V Paddsw MM5, MM0 // R = 1.164 (Y-16) 1.596 (V-128) R -> MM5 MOVQ MM6, MM0 File: // Copy 1.164 (Y-16) PSUBSW MM6, MMT0391U File: //1.164 (Y-16) -0.391 (U-128) PSUBSW MM6, MMT0813V / / G = 1.164 (Y-16) -0.391 (U-128) -0.813 (V-128) g -> mm6 paddsw mm0, MMT2018U // b = 1.164 (Y-16) 2.018 (U-128 ) B -> mm0

// b -> mm0, g -> mm6, r -> mm5 packuswb mm6, mm7 packuswb mm0, mm7 PUNPCKLBW MM0, MM6 / / G3 B3 G2 B2 G1 B1 G0 B0 -> MM0 PackusWB MM5, MM7 PUNPCKLBW MM5, MM7 / / 00 R3 00 r2 00 r1 00 r0 -> mm5 MOVQ MM7, MM0

MOVQ MM6, [EDX] // 2 0

PUNPCKLWD MM0, MM5 / / 00 R1 G1 B1 00 R0 G0 B0 PUNPCKHWD MM7, MM5 // 00 R3 G3 B3 00 R2 G2 B2

MOVQ MM5, MM6 PUNPCKLDQ MM6, MM0 // 1 0 PUNPCKHDQ MM5, MM0 // 3 2 MOVQ MM0, [EDX 8] // 4 6 MOVQ [EDX], MM6 MOVQ [EDX 8], MM5 MOVQ MM6, MM0 PUNPCKHDQ MM0, MM7 / / 7 6 PUNPCKLDQ MM6, MM7 // 5 4 MOVQ [EDX 24], MM0 MOVQ [EDX 16], MM6 Sub ESI, NYW SUB EDX, RGBWIDTH ADD ESI, 4 Add EDI, 4 Add Eax , 8 Add Edx, 32 DEC ECX JNZ RRRMOV ECX, COL Add Eax, NYW Add Edx, RgbWidth Dec EBX JNZ RRR EMMS}} # ENDIF

转载请注明原文地址:https://www.9cbs.com/read-5523.html

New Post(0)