A UTF-8 decoding algorithm in Visual C++

xiaoxiao2021-03-06  42

To save network bandwidth, many networked applications (MSN, for example) transmit text encoded as UTF-8. The principle behind the encoding is actually very simple; below is a Visual C++ example for Windows that decodes it.

#define unkown_char '? # Define isbyteofonebytechar (b) (((b) & 0x80) == 0x00) #define isheadoftwobyteschar (b) (((b) & 0xe0) == 0xc0) #define isheadoftHreebyteschar (b) (( (b) & 0xF0) == 0xE0) #define isComingByteOfMultBytesChar (b) (((b) & 0xC0) == 0x80) #define isHeadOfUTF8Char (b) / isByteOfOneByteChar (b) || / isByteOfOneByteChar (b) || / isHeadOfThreeBytesChar (b) Data Data :: Utf8Decode (const data & utf8hex) {int L = uTF8HEX.LENGTH (); unsigned short * unicode = new unsigned short [l * 2]; int outi = 0; for (int i = 0; i L) {Unicode [outi ] = unkown_char; break;} byte b2 = (byte) UTF8HEX.GETCHAR (i 1); if (iScomingByteofmultByteschar (b2)) {unsigned short tmp = b1 & 0x1f ; TMP = (TMP << 6) (B2 & 0x3F); Unicode [outi ] = TMP; i = 2;} else IF (isheadofutf8char (b2)) {UNI Code [outi ] = unkown_char; i = 1;} else {unicode [outi ] = unkown_char; i = 2;}} else if (isheadoftofthreebyteschar (b1)) {if (i 2> L) {Unicode [OUTI ] = Unkown_char; Break;} else if (i 3> L) {byte b2 = (byte) UTF8HEX.GETCHAR (I 1); if (ISBYTEOFONEBYTECHAR (B2)) {unicode [outi ] = unkown_char; i = 1; Continue;} else {unicode [outi ] = unkown_char; break;

}} BYTE b2 = (BYTE) utf8hex.getChar (i 1); BYTE b3 = (BYTE) utf8hex.getChar (i 2); if (isComingByteOfMultBytesChar (b2) && isComingByteOfMultBytesChar (b3)) {unsigned short tmp = b1 & 0x0f; TMP = TMP << 12; unsigned short tmp2 = b2 & 0x3f; TMP2 = TMP2 << 6; Unicode [OUTI ] = TMP TMP2 (B3 & 0x3F); i = 3;} else f (isheadofutf8char (b2)) {unicode [outi ] = unkown_char; i = 1;} else if (isheadofutf8char (b3)) {unicode [outi ] = unkown_char; i = 2;} else {unicode [outi ] = unkown_char; i = 3;}} else {unicode [outi ] = unkown_char; i = 1;}} unicode [outi] = 0; Data Ret (outi * 4, true); # ifdef win32 int count = widechartomultibyte (0, 0 , Unicode, Outi, Ret.mbuf, Outi * 4, NULL, NULL); if (count == 0) CPLOG (log_err, "get utf byte, error number =% lu", getLastError ()); # else # Endif IF (count> 0) {RET.MBUF [count] = '/ 0'; ret.mlength = count;} delete [] unicode;

Return Ret;}

转载请注明原文地址:https://www.9cbs.com/read-57520.html

New Post(0)