Possibly the fastest alpha-blending algorithm: assembly source code

zhaozj2021-02-16 169

Intel's official website offers a fast alpha-blending routine, ablend_565. In principle it blends a 32-bit RGBA image onto a 16-bit (RGB565) buffer. On my machine, a PIII 800, running the function in system memory on a 640 * 480 image with 256-level alpha blending reaches 100 FPS, which I think meets the demands of most people. Here I provide an application of this algorithm, and I hope it helps everyone. The source code of the ablend_565 function can be compiled directly and needs no other library functions; thanks to Intel for providing such a good thing.

First, I offer some of my own function file that reads 32bit TGA files in prgbabuffer in width, Height // ------------------------------------------------------------------------------------------------------------ ---------------------------------------------- // name : Loadingtgafile (tchar * strpathname, dword ** prgbabuffer, long * width, long * height) // DESC: Read 32bit TGA files to DWORD buffer, return to its size // Time: 2002.06.22 00: 36 // Author : RealRender // Para: // Return: // Note: This code comes from D3DTextr.cpp in DirectX 7.0 Sample, I extracted him out // Easy to use // --------- -------------------------------------------------- ---------- Bool Loadtgafile (tchar * strpathname, dword ** prgbabuffer, long * width, long * height) {file * file = fopen (strpathname, "rb"); if (null == file ) return false; struct TargaHeader {BYTE IDLength; BYTE ColormapType; BYTE ImageType; BYTE ColormapSpecification [5]; WORD XOrigin; WORD YOrigin; WORD ImageWidth; WORD ImageHeight; BYTE PixelDepth; BYTE ImageDescriptor;} tga; fread (& tga, sizeof (TargaHeader ), 1, file); // only true color, non-mapped images area supportedif ((0! = Tga.colormaptype) || (TGA.ImageType! = 10 && tga.imageType! = 2) {fclose ( file); return false;} // Skip the ID field The first byte of the header is the length of this fieldif (tga.IDLength) fseek (file, tga.IDLength, SEEK_CUR);. DWORD m_dwWidth = tga.ImageWidth; DWORD m_dwHeight = tga.ImageHeight; DWORD m_dwBPP = tga.PixelDepth; DWORD * m_pRGBAData = new DWORD [m_dwWidth * m_dwHeight]; if (m_pRGBAData == NULL) {fclose (file); return false;} for (DWORD y = 0; y {DWORD DWOFFSET = Y * m_dwwidth; if (0 == (TGA.ImageDescriptor & 0x0010)) DWOFSET = (m_dwheight-y-1) * m_dwwidth; for (DWORD X = 0; x {IF (tga.imageType == 10 ) {Byte packetinfo = getc; word packettype = 0x80 & packetInfo;

Word PixelCount = (0x007F & PacketInfo) 1; if (packettype) {DWORD B = Getc (file); DWORD G = Getc (file); DWORD R = Getc (file); DWORD A = 0xff; if (m_dwbpp == 32) a = getc (file); while (pixelcount--) {m_Prgbadata [dwoffset x] = (r << 24L) (g << 16L) (B << 8L) (a); x ; }}} else {while (pixelcount--) {byte b = getc (file); byte g = getc (file); byte r = getc (file); byte a = 0xff; if (m_dwbpp == 32) a = Getc (file); m_Prgbadata [dwoffset x] = (r << 24L) (g << 16L) (B << 8L) (a); x ;}}} else {byte b}}} else {byte b = getc (file) Byte g = getc (file); byte r = getc (file); byte a = 0xff; if (m_dwbpp == 32) a = getc (file); m_Prgbadata [dwoffset x] = (r << 24L) (g << 16L) (B << 8L) (a); x ;}}} fclose (file); // Check for alpha contentfor (dword i = 0; i <(m_dwwidth * m_dwheight); i ) {if (m_pRGBAData [i] & 0x000000ff = 0xff!) {// m_bHasAlpha = TRUE; break;}} * pRGBABuffer = m_pRGBAData; * width = m_dwWidth; * height = m_dwHeight; return true;} the 32bit buffer and divided into rgb Alpha's code.

Note that the divided Pbitmap must be an 8-byte alignment, which is an important condition for optimization, so in my algorithm: byte * p = new byte [lsize * 2 8]; byte * porig = p; p = (DWORD) P% 8; Word * Color = (Word *) P; this is an irregular writing method, and the pointer is forced to 8 ordered. When actually use, the original pointer to remember is not p, but Is Porig, here, I didn't release the allocated memory, please forgive me. / / -------------------------------------------------------------------------------------------- ----------------------- // name: splitrgba (dword * prgabuffer, lpbyte * palpha, lpword * Pbitmap, long lwidth, long lheight) // DESC : // Time: 2002.06.22 00: 36 // Author: RealRender // Para: // Return: // Note: Set up 16bit 565 buffer and 8bit's alpha channel // ----- -------------------------------------------------- ---------------- Void splitrgba (dword * prgbabuffer, lpbyte * palpha, lpword * Pbitmap, long lwidth, long lheight) {long lsize = lwidth * Lheight; BYTE * alpha = New Byte * p = new byte [lsize * 2 8]; // forcibly converted to 8-byte align P = (DWORD) P% 8; Word * Color = (Word *) P; DWORD DWPIXEL DWORD R, G, B, A; For (int i = 0; i > 24) & 0x000000FF); g = (DWPixel >> 16) & 0x000000FF); b = ((dwpixel >> 8) & 0x000000FF); A = ((dwpixel >> 0) & 0x000000FF); alpha [i] = a; // 888i convert to 565Color [i] = RGBTO16 (R, g, b);} * Palpha = alpha; * Pbitmap = color;} // This function of the INTEL official, the description of the function, using me, is drawing a 565 color data with a 256 alpha channel To the 16-bit target page. Function Description:

unsigned char *lpAlpha,                   // 256-level alpha channel

unsigned int iAlpPitch,                   // alpha channel pitch

unsigned char *lpSrc,                     // source color buffer

unsigned int iSrcX, unsigned int iSrcY,   // source color position

unsigned int iSrcPitch,                   // source color pitch

unsigned char *lpDst,                     // target buffer

unsigned int iDstX, unsigned int iDstY,   // target position

unsigned int iDstW, unsigned int iDstH,   // target buffer size

unsigned int iDstPitch                    // target buffer

pitchvoid ablend_565 (unsigned char * lpAlpha, unsigned int iAlpPitch, unsigned char * lpSrc, unsigned int iSrcX, unsigned int iSrcY, unsigned int iSrcPitch, unsigned char * lpDst, unsigned int iDstX, unsigned int iDstY, unsigned int iDstW, unsigned int iDstH, unsigned int iDstPitch) {// Mask for isolating the red, green, and blue componentsstatic __int64 MASKB = 0x001F001F001F001F; static __int64 MASKG = 0x07E007E007E007E0; static __int64 MASKSHIFTG = 0x03F003F003F003F0; static __int64 MASKR = 0xF800F800F800F800; // constants used by the integer alpha blending equationstatic __int64 SIXTEEN = 0x0010001000100010; static __int64 FIVETWELVE = 0x0200020002000200; static __int64 SIXONES = 0x003F003F003F003F; unsigned char * lpLinearDstBp = (iDstX << 1) (iDstY * iDstPitch) lpDst; // base pointer for linear destinationunsigned char * lpLinearSrcBp = ( iSrcX << 1) (iSrcY * iSrcPitch) lpSrc; // base pointer for linear sourceunsigned char * lpLinearAlpBp = iSrcX (iSrcY * iAlpPitch) lpAlpha; // base pointer for linear alpha_asm {mov esi, lpLinearSrcBp; // srcmov edi, lpLinearDstBp; // dstmov eax, lpLinearAlpBp; // alphamov ecx, iDstH; // ecx = number of lines to copymov ebx, iDstW; // ebx = span width to copytest esi, 6; // check if source address Is Qword Aligned // Since Addr Coming In is always word aligned (16bit) jnz done; // if not qword aligned we don't do anythingPrimeloop: MOVD MM1, [EAX]; // mm1 = 00 00 00 A3 A2 A1 A0PXOR MM2, MM2; // mm2 = 0; MOVQ MM4, [ESI]; // G1: MM4 = SRC3 SRC2 SRC1 SRC0PUNPCKLBW MM1, MM2; // mm1 = 00a3 00a2 00a1 00a0loopqword: MOV EDX, [EAX]; test EBX , 0xfffffffc; // Check if only 3 Pixels Leftjz Checkback; // 3 or Less Pixels Left // Early Out Testscmp Edx, 0xffffff

// test for alpha value of 1je copyback; // if 1's copy the source pixels to the destinationtest edx, 0xffffffff; // test for alpha value of 0jz leavefront; // if so go to the next 4 pixels // the alpha blend Starts // Green // I = A * SG (63-A) * DG; // i = (i 32) ((i 32) >> 6) >> 6; // RED // i = A * SR (31-A) * DR; // i = (i 16) ((i 16) >> 5) >> 5; MOVQ MM5, [EDI]; // g2: mm5 = DST3 DST2 DST1 DST0PSRLW MM1, 2; // mm1 = a? >> 2 Nuke Out Lower 2 Bitsmovq MM7, MASKSHIFTG; / / G3: MM7 = 1 Bit Shifted Green Maskpsrlw MM4, 1; // G3a: Move Src Green Down by 1 SO That We Won't overflowmovq mm0, mm1; // mm0 = 00a3 00a200a1 00a0psrLW mm5, 1; // g3b: Move Dst Green Down by 1 SO That We Worn '1 er' s m m '' 'er er b > 1 Nuke Out Lower 1 Bitspand MM4, MM7; // G5: MM4 = SG3 SG2 SG1 SG0MOVQ MM2, SIXONES; // G4: MM2 = 63pand MM5, MM7; // G7: MM5 = DG3 DG2 DG1 DG0MOVQ MM3, [ESI ]; // b1: mm3 = SRC3 SRC2 SRC1 SRC0PSUBSB MM2, MM0; // G6: MM2 = 63-A3 63-A2 63-A1 63-A0MOVQ MM7, MASKB; // B2: MM7 = Blue Maskpmullw MM4, MM0; // G8: mm4 = SG? * a? MOVQ MM0, [EDI]; // b3: mm0 = DST3 DST2 DST1 DST0PMULLW MM5, MM2; // G9: MM5 = DG? * (1-a?) MOVQ MM2, MM7; // B4: mm2 = FiveOnSpand mm3, mm7; // b4: mm3 = SB3 SB2 SB1 SB0Pmullw mm3, mm1; / / b6: mm3 = SB? * a? PAND MM0, MM7; // B5: MM0 = DB3 DB2 DB1 DB0MOVQ MM7, [ESI]; // R1: MM7 = SRC3 SRC2 SRC1 SRC0PADDW MM4, MM5; // G10: MM4 = SG? * a? DG? * (1-a?) Pand mm7, maskr; // r2: mm7 = SR3 SR2 SR1 SR0

PSUBSB mm2, mm1; // b5a: mm2 = 31-A3 31-A2 31-A1 31-A0PADDW MM4, FiveTwelve; // G11: mm4 = (mm4 512) Greenpmullw MM0, MM2; // B7: MM0 = DB ? * (1-a?) MOVQ MM5, MM4; // G12: mm5 = mm4 Greenpsrlw mm7 ,11; // r4: shift src red down to position 0PSRLW mm4, 6; // g13: mm4 = mm4 >> 6paddw MM4, MM5; // G14: MM4 = MM4 MM5 GreenpaddW MM0, MM3; // B8: MM0 = SB? * a? DB? * (1-a?) MOVQ MM5, [EDI]; // R3: MM5 = DST3 DST2 DST1 DST0PADDW MM0, Sixteen; // B9: MM0 = (MM0 16) Bluepand MM5, Maskr; // R5: MM5 = DR3 DR2 DR1 DR0PSRLW MM4, 5; // G15: MM4 = 0? G0 0 ? G0 0? G0 0? G0 Greenmovq mm3, mm0; // b10: mm3 = mm0 bluepsrlw mm0, 5; // b11: mm0 = mm0 >> 5 Bluepsrlw mm5 ,11; // r6: shift dst red down to position 0paddw mm0, mm3; // b12: mm0 = mm3 mm0 bluepsrlw mm0, 5; // b13: mm0 = 000B 000B 000B 000B BluePmullw mm7, mm1; // mm7 = SR? * A? PAND MM4, Maskg; // G16: mm4 = 00g0 00g0 00g0 00 g0 Greenpmullw mm5, mm2; // r7: mm5 = DR? * (31-a?) POR MM0, MM4; // mm0 = 00GB 00GB 00GB 00GBADD EAX, 4; // move to next 4 alphasadd esi, 8; //move to next 4 Pixels in Srcadd EDI, 8; // Move To Next 4 Pixels in Dstmovd MM1, [EAX]; // mm1 = 00 00 00 00 A2 A1 A0PADDW MM5, MM7; // R8: MM5 = SR? * a? DR? * (31-a?) Paddw mm5, sixteen; // r9: mm5 = (mm5 16) REDPXOR MM2, MM2; // mm2 = 0; MOVQ MM7, MM5; // R10: MM7 = MM5 REDPSRLW MM5, 5; // R11: MM5 = MM5 >> 5 RedMoVQ MM4, [ESI]; // G1: MM4 = SRC3 SRC2 SRC1 SRC0PADDW MM5, MM7; // R12: MM5 = MM7 MM5 REDPUNPCKLBW MM1, MM2; // MM1 = 00a300A2 00A1 00A0PSRLW MM5, 5; // R13: mm5 = mm5 >> 5 Redpsllw MM5 11; // r14: mm5 = mm5 << 10 redpor mm0, mm5; // mm0 = 0RGB 0RGB 0RGB 0RGBSUB EBX, 4; // Polished Off 4 PixelsMovq [EDI-8], MM0; // DST = 0RGB 0RGB 0rgb 0rgbjmp loopqword;

// Go Back to Startcopyback: MOVQ [EDI], MM4; // Copy Source To DestinationleFront: Add Edi, 8; // Advance Destination By 4 PixelsAdd Eax, 4; // Advance Alpha by 4Add ESI, 8; // Advance source by 4 pixelssub ebx, 4; // decrease pixel count by 4jmp primeloop; checkback: test ebx, 0xFF; // check if 0 pixels leftjz nextline; // done with this span // backalign: // work out back end pixelsmovq MM5, [EDI]; // G2: MM5 = DST3 DST2 DST1 DST0PSRLW MM1, 2; // mm1 = a? >> 2 Nuke Out Lower 2 Bitsmovq MM7, Maskshiftg; // G3: MM7 = Shift 1 Bit Green Maskpsrlw MM4 , 1; // G3a: Move Src Green Down by 1 SO That We Won't overflowmovqq MM0, MM1; // MM0 = 00a3 00a2 00a1 00a0PRLW MM5, 1; // G3B: Move Dst Green Down by 1 SO That We Won 't overflowpsrlw mm1,1; // mm1 = a? >> 1 Nuke Out Lower 1 Bitspand MM4, MM7; // G5: MM4 = SG3 SG2 SG1 SG0MOVQ MM2, SIXONES; // G4: MM2 = 63pand mm5, mm7; // g7: mm5 = DG3 DG2 DG1 DG0MOVQ MM3, [ESI]; // b1: mm3 = src3 src2 src1 src0psubsb mm2, mm0; // g6: mm2 = 63-A3 63-A2 63-A1 63-A0MOVQ MM7, Maskb; // b2: mm7 = blue maskpmullw mm4, mm0; // g8: mm4 = SG? * a? MOVQ MM0, [EDI]; // B3: MM0 = DST3 DST2 DST1 DST0Pmullw MM5, MM2; // G9: MM5 = DG? * (1-a?) MOVQ MM2, MM7; // B4 : mm2 = FIVEONESPAND MM3, MM7; // B4: MM3 = SR3 SR2 SR1 SR0Pmullw MM3, MM1; // B6: MM3 = SB? * a? Pand MM0, MM7; // B5: MM0 = DB3 DB2 DB1 DB0MOVQ MM7, [ESI]; // r1: mm7 = SRC3 SRC2 SRC1 SRC0PADDW MM4, MM5; // G10: MM4 = SG? * a? DG? * (1-a?) PAND MM7, MASKR; // R2: MM7 = SR3 SR2 SR1 SR0PSUBSB mm2, mm1; // b5a: mm2 = 31-A3 31-A2 31-A1 31-A0PADDW MM4, FiveTwelve; // G11: MM4 = (i 512) Greenpmullw MM0, MM2; // b7: MM0 = DB? * (1-a?) MOVQ MM5, MM4; // G12: MM5 = (i 512) Greenpsrlw mm7,11; // r4: shift src red down to position 0PSRLW MM4, 6; // g13 : mm4 = (i

512) >> 6Paddw mm4, mm5; // g14: mm4 = (i 512) ((i 512) >> 6) Greenpaddw mm0, mm3; // b8: mm0 = Sb? * A? DB? * (1-a?) MOVQ MM5, [EDI]; // R3: mm5 = DST3 DST2 DST1 DST0PADDW MM0, Sixteen; // B9: MM0 = (i 16) BluePand MM5, MASKR; // R5: mm5 = DR3 DR2 DR1 DR0PSRLW MM4, 5; // G15: MM4 = 0? G0 0? G0 0? G0 0? G0 Greenmovq MM3, MM0; // B10: MM3 = (i 16) BluepsRLW MM0, 5; // B11 : mm0 = (i 16) >> 5 Bluepsrlw mm5 ,11; // r6: shift dst red down to position 0paddw mm0, mm3; // b12: mm0 = (i 16) (i 16) >> 5 BluePSRLW MM0, 5; // B13: MM0 = 000r 000r 000r 000R bluepmullw mm7, mm1; // mm7 = SR? * A? Pand MM4, MASKG; // G16: MM4 = 00g0 00g0 00g0 GreenPmullw MM5, MM2; // r7: mm5 = DR? * (31-a?) POR MM0, MM4; // mm0 = 00GB 00GB 00GB 00GBADD EAX, 4; // Move to next 4 alphas // stallpaddw mm5, mm7; // r8: MM5 = SR? * a? DR? * (31-a?) Paddw MM5, Sixteen; // R9: mm5 = (i 16) REDMOVQ MM7, MM5; // R10: mm7 = (i 16) redpsrlw MM5, 5; // R11: mm5 = (i 16) >> 5 redpaddw mm5, mm7; // r12: mm5 = (i 16) ((i 16) >> 5) REDPSRLW MM5, 5; // R13: mm5 = (i 16) ((i 16) >> 5) >> 5 redpsllw mm5 ,11; // r14: mm5 = mm5 << 10 redpor mm0, mm5; // mm0 = 0 RGB 0RGB 0RGB 0RGbTest EBX, 2; // Check if The are 2 Pixelsjz OneEndpixel; // Goto One Pixel IF That's itMOVD [EDI], MM0; // DST = 0000 0000 0RGB 0RGBPSRLQ MM0, 32; // MM0 >> 32Add EDI, 4; // EDI = EDI 4SUB EBX, 2; // Saved 2 Pixelsjz Next Linenendpixel: // Work On Last Pixelmovd EDX, MM0; // EDX = 0RGBMOV [EDI], DX ; // DST = 0RGBNextLine: // goto next linedec ECX; // Nuke One linejz done; // all Donemov Eax, LPLINEARALPBP; // Alphamov ESI, LPLINEARSRCBP; // Srcmov Edi, LPLineardStbp;

转载请注明原文地址:https://www.9cbs.com/read-28203.html

9cbs

New Post(0)