// UTF-8 encoding and decoding routines.
#include "unicode.h"
|
|
|
|
|
|
size_t utf8_Decode(const char *utf8, size_t len, uint32_t *codepoint) {
|
|
// Some useful precomputed data
|
|
static const int trailing[256] =
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5};
|
|
static const uint32_t offsets[6] =
|
|
{0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080};
|
|
|
|
// decode the character
|
|
uint32_t output;
|
|
int trailingBytes;
|
|
|
|
// read the first byte
|
|
uint8_t first = *utf8;
|
|
trailingBytes = trailing[first];
|
|
if ((first >= HELOS_UTF8_CONTINUATION_MIN && first <= HELOS_UTF8_OVERLONG_LEADER_2) ||
|
|
(first >= HELOS_UTF8_INVALID_LEADER_MIN && first <= HELOS_UTF8_INVALID_LEADER_MAX) ||
|
|
trailingBytes + 1 > len) {
|
|
// corrupted data or incomplete character
|
|
trailingBytes = 0;
|
|
if (codepoint != 0)
|
|
(*codepoint) = HELOS_UNICODE_ERROR;
|
|
} else if (codepoint != 0) {
|
|
output = 0;
|
|
|
|
// so elegant!
|
|
switch (trailingBytes) {
|
|
case 5: output += (uint8_t)(*utf8++); output <<= 6;
|
|
case 4: output += (uint8_t)(*utf8++); output <<= 6;
|
|
case 3: output += (uint8_t)(*utf8++); output <<= 6;
|
|
case 2: output += (uint8_t)(*utf8++); output <<= 6;
|
|
case 1: output += (uint8_t)(*utf8++); output <<= 6;
|
|
case 0: output += (uint8_t)(*utf8++);
|
|
}
|
|
|
|
(*codepoint) = output - offsets[trailingBytes];
|
|
}
|
|
|
|
return trailingBytes + 1;
|
|
}
|
|
|
|
// Returns how many bytes the UTF-8 encoding of `codepoint` occupies (1 to 4),
// or 0 when `codepoint` is above 0x10FFFF and therefore cannot be encoded.
size_t utf8_EncodeLength(uint32_t codepoint) {
    if (codepoint > 0x10ffff) {
        return 0; // invalid
    }
    if (codepoint >= 0x10000) { // 10000 ~ 10FFFF
        return 4;
    }
    if (codepoint >= 0x0800) { // 0800 ~ FFFF
        return 3;
    }
    if (codepoint >= 0x0080) { // 0080 ~ 07FF
        return 2;
    }
    return 1; // 0000 ~ 007F
}
|
|
|
|
size_t utf8_Encode(char *utf8, uint32_t codepoint) {
|
|
// Some useful precomputed data
|
|
static const uint8_t firstBytes[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
|
|
|
|
if (codepoint > HELOS_UNICODE_MAX)
|
|
codepoint = HELOS_UNICODE_ERROR; // substitute invalid codepoint
|
|
|
|
// get the number of bytes to write
|
|
size_t len = utf8_EncodeLength(codepoint);
|
|
// write the bytes
|
|
// so elegant also!
|
|
switch (len) {
|
|
case 4: utf8[3] = (char)((codepoint | 0x80) & 0xBF); codepoint >>= 6;
|
|
case 3: utf8[2] = (char)((codepoint | 0x80) & 0xBF); codepoint >>= 6;
|
|
case 2: utf8[1] = (char)((codepoint | 0x80) & 0xBF); codepoint >>= 6;
|
|
case 1: utf8[0] = (char)(codepoint | firstBytes[len]);
|
|
}
|
|
|
|
return len;
|
|
}
|