Simple UTF-8 C Decoder

December 23, 2008

This is simple C (or C++) code for a UTF-8 decoder.

// Given the first byte of an encoded character, return how many bytes the
// whole character occupies.
//
// The result is 1 for ASCII (0x00..0x7F) and 2..6 for lead bytes, using the
// original (pre-RFC 3629) UTF-8 layout that allows 5- and 6-byte forms.
// A continuation byte (0x80..0xBF) is not a valid first byte; it is reported
// as a 1-byte unit so that a caller stepping through a string skips exactly
// that byte and cannot jump over the terminator that may follow it.
int NumberOfUTF8Chars(unsigned char ch)
{
    if (ch < 0x80u) {
        return 1;   // 0xxxxxxx: ASCII
    }
    if (ch < 0xC0u) {
        return 1;   // 10xxxxxx: stray continuation byte, not a lead byte
    }
    if (ch < 0xE0u) {
        return 2;   // 110xxxxx
    }
    if (ch < 0xF0u) {
        return 3;   // 1110xxxx
    }
    if (ch < 0xF8u) {
        return 4;   // 11110xxx
    }
    if (ch < 0xFCu) {
        return 5;   // 111110xx
    }
    return 6;       // 1111110x (0xFE and 0xFF also land here)
}

// Given a pointer to the first byte of an encoded character, return the code
// value of that character.
//
// ch must point at least at one readable byte; for multi-byte characters the
// following bytes are read one at a time and decoding stops at the first byte
// that is not a continuation byte (10xxxxxx). Because a NUL terminator is not
// a continuation byte, a truncated sequence at the end of a NUL-terminated
// string is never read past its terminator.
//
// Returns the decoded value, or 0xFFFD (U+FFFD REPLACEMENT CHARACTER) when
// the first byte is a stray continuation byte or a trailing byte is missing
// or malformed. Overlong forms and surrogate values are not rejected.
unsigned int ValueOfUTF8Code(const char* ch)
{
    const unsigned int Replacement = 0xFFFDu;

    // Plain char may be signed; work on unsigned bytes so that comparisons
    // and masks do not depend on the platform's char signedness.
    const unsigned char *Bytes = (const unsigned char *)ch;
    int Size = NumberOfUTF8Chars(Bytes[0]);

    if (Size == 1) {
        // Either ASCII, or a continuation byte where a lead byte was expected.
        return (Bytes[0] < 0x80u) ? Bytes[0] : Replacement;
    }

    // A lead byte of an N-byte sequence carries its payload in the low
    // (7 - N) bits: 0x1F, 0x0F, 0x07, 0x03, 0x01 for N = 2..6.
    unsigned int Value = Bytes[0] & (0xFFu >> (Size + 1));

    for (int i = 1; i < Size; i++) {
        if ((Bytes[i] & 0xC0u) != 0x80u) {
            return Replacement;   // truncated or malformed sequence
        }
        Value = (Value << 6) | (Bytes[i] & 0x3Fu);
    }

    return Value;
}

// Original author's note: this code was never actually tested.
// Use it at your own risk.

Advertisements