helos1/runtime/unicode.c


#include "unicode.h"


// Decodes a single UTF-8 sequence from the start of a byte buffer.
//
//   utf8      - first byte of the encoded sequence
//   len       - number of bytes that may be read starting at utf8
//   codepoint - receives the decoded value, or HELOS_UNICODE_ERROR when the
//               input is malformed or truncated; may be null when the caller
//               only wants the sequence length
//
// Returns the number of bytes the caller should advance by: the full sequence
// length on success, and 1 on any error. The result is never 0, including for
// len == 0, so a scanning loop always makes progress.
//
// A sequence is rejected when the lead byte falls in one of the HELOS_UTF8_*
// ranges from unicode.h, when it does not fit in len bytes, or when any
// trailing byte is not a continuation byte (10xxxxxx).
// NOTE(review): overlong 3/4-byte forms and surrogate values are not checked
// here; only what the lead-byte ranges exclude is filtered.
size_t utf8_Decode(const char *utf8, size_t len, uint32_t *codepoint) {
	// Number of bytes that follow each possible lead byte.
	static const int trailing[256] =
		{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5};
	// offsets[n] is the sum of the marker bits contributed by a lead byte and
	// n continuation bytes once they are accumulated 6 bits at a time, e.g.
	// offsets[1] == (0xC0 << 6) + 0x80.
	static const uint32_t offsets[6] =
		{0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080};

	// An empty buffer has no lead byte; report an error without touching it.
	if (len == 0) {
		if (codepoint)
			(*codepoint) = HELOS_UNICODE_ERROR;
		return 1;
	}

	const uint8_t first         = (uint8_t)utf8[0];
	const int     trailingBytes = trailing[first];

	int malformed =
		(first >= HELOS_UTF8_CONTINUATION_MIN && first <= HELOS_UTF8_OVERLONG_LEADER_2) ||
		(first >= HELOS_UTF8_INVALID_LEADER_MIN && first <= HELOS_UTF8_INVALID_LEADER_MAX) ||
		(size_t)trailingBytes + 1 > len;

	// Every trailing byte must look like 10xxxxxx. Without this check a lead
	// byte followed by ASCII (or a terminator) would swallow those bytes.
	for (int i = 1; !malformed && i <= trailingBytes; i++) {
		if (((uint8_t)utf8[i] & 0xC0) != 0x80)
			malformed = 1;
	}

	if (malformed) {
		// corrupted data or incomplete character: skip exactly one byte
		if (codepoint)
			(*codepoint) = HELOS_UNICODE_ERROR;
		return 1;
	}

	if (codepoint) {
		// Accumulate the raw bytes 6 bits at a time, then strip all marker
		// bits in one subtraction.
		uint32_t output = 0;
		for (int i = 0; i <= trailingBytes; i++)
			output = (output << 6) + (uint8_t)utf8[i];

		(*codepoint) = output - offsets[trailingBytes];
	}

	return (size_t)trailingBytes + 1;
}

// Reports how many bytes the UTF-8 encoding of codepoint occupies.
//
// Returns 1 to 4 for values up to 0x10FFFF, and 0 for anything larger, which
// cannot be encoded.
size_t utf8_EncodeLength(uint32_t codepoint) {
	// Largest value representable in 1, 2, 3 and 4 bytes respectively.
	static const uint32_t upperBound[4] = {0x007f, 0x07ff, 0xffff, 0x10ffff};

	for (size_t bytes = 1; bytes <= 4; bytes++) {
		if (codepoint <= upperBound[bytes - 1]) {
			return bytes;
		}
	}

	return 0; // beyond the encodable range
}

// Writes the UTF-8 encoding of codepoint into utf8.
//
//   utf8      - destination buffer; must have room for the bytes written
//               (the length is whatever utf8_EncodeLength reports)
//   codepoint - value to encode; anything above HELOS_UNICODE_MAX is replaced
//               by HELOS_UNICODE_ERROR before encoding
//
// Returns the number of bytes written. No terminator is appended.
size_t utf8_Encode(char *utf8, uint32_t codepoint) {
	// Marker bits of the lead byte, indexed by total sequence length.
	static const uint8_t leadMarker[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

	if (codepoint > HELOS_UNICODE_MAX) {
		codepoint = HELOS_UNICODE_ERROR; // substitute invalid codepoint
	}

	const size_t total = utf8_EncodeLength(codepoint);

	if (total >= 1 && total <= 4) {
		// Fill continuation bytes from the last one backwards; each carries
		// the low 6 bits of what is left, tagged with the 10xxxxxx marker.
		for (size_t pos = total - 1; pos > 0; pos--) {
			utf8[pos] = (char)((codepoint | 0x80) & 0xBF);
			codepoint >>= 6;
		}
		// Whatever bits remain go into the lead byte under its length marker.
		utf8[0] = (char)(codepoint | leadMarker[total]);
	}

	return total;
}
Initial commit 2021-10-10 14:39:17 +08:00
			`#include "unicode.h"`


			`size_t utf8_Decode(const char *utf8, size_t len, uint32_t *codepoint) {`
			`// Some useful precomputed data`
			`static const int trailing[256] =`
			`{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5};`
			`static const uint32_t offsets[6] =`
			`{0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080};`

			`// decode the character`
			`uint32_t output;`
			`int trailingBytes;`

			`// read the first byte`
			`uint8_t first = *utf8;`
			`trailingBytes = trailing[first];`
			`if ((first >= HELOS_UTF8_CONTINUATION_MIN && first <= HELOS_UTF8_OVERLONG_LEADER_2) ||`
			`(first >= HELOS_UTF8_INVALID_LEADER_MIN && first <= HELOS_UTF8_INVALID_LEADER_MAX) ||`
			`trailingBytes + 1 > len) {`
			`// corrupted data or incomplete character`
			`trailingBytes = 0;`
			`if (codepoint != 0)`
			`(*codepoint) = HELOS_UNICODE_ERROR;`
			`} else if (codepoint != 0) {`
			`output = 0;`

			`// so elegant!`
			`switch (trailingBytes) {`
			`case 5: output += (uint8_t)(*utf8++); output <<= 6;`
			`case 4: output += (uint8_t)(*utf8++); output <<= 6;`
			`case 3: output += (uint8_t)(*utf8++); output <<= 6;`
			`case 2: output += (uint8_t)(*utf8++); output <<= 6;`
			`case 1: output += (uint8_t)(*utf8++); output <<= 6;`
			`case 0: output += (uint8_t)(*utf8++);`
			`}`

			`(*codepoint) = output - offsets[trailingBytes];`
			`}`

			`return trailingBytes + 1;`
			`}`

			`size_t utf8_EncodeLength(uint32_t codepoint) {`
			`if (codepoint <= 0x007f) // 0000 ~ 007F`
			`return 1;`
			`else if (codepoint <= 0x07ff) // 0080 ~ 07FF`
			`return 2;`
			`else if (codepoint <= 0xffff) // 0800 ~ FFFF`
			`return 3;`
			`else if (codepoint <= 0x10ffff) // 10000 ~ 10FFFF`
			`return 4;`

			`return 0; // invalid`
			`}`

			`size_t utf8_Encode(char *utf8, uint32_t codepoint) {`
			`// Some useful precomputed data`
			`static const uint8_t firstBytes[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};`

			`if (codepoint > HELOS_UNICODE_MAX)`
			`codepoint = HELOS_UNICODE_ERROR; // substitute invalid codepoint`

			`// get the number of bytes to write`
			`size_t len = utf8_EncodeLength(codepoint);`
			`// write the bytes`
			`// so elegant also!`
			`switch (len) {`
			`case 4: utf8[3] = (char)((codepoint | 0x80) & 0xBF); codepoint >>= 6;`
			`case 3: utf8[2] = (char)((codepoint | 0x80) & 0xBF); codepoint >>= 6;`
			`case 2: utf8[1] = (char)((codepoint | 0x80) & 0xBF); codepoint >>= 6;`
			`case 1: utf8[0] = (char)(codepoint | firstBytes[len]);`
			`}`

			`return len;`
			`}`