/********************************************************/ /* ntapi: Native API core library */ /* Copyright (C) 2013--2017 Z. Gilboa */ /* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */ /********************************************************/ #include #include #include /** * unofficial bit distribution table for comprehension purposes only * * scalar nickname utf-16 utf-8[0] utf-8[1] utf-8[2] utf-8[3] * ------ -------- -------- -------- -------- -------- -------- * 00000000 7x 00000000 0xxxxxxx * 0xxxxxxx 0xxxxxxx * * 00000yyy 5y6x 00000yyy 110yyyyy 10xxxxxx * yyxxxxxx yyxxxxxx * * zzzzyyyy 4z6y6x zzzzyyyy 1110zzzz 10yyyyyy 10xxxxxx * yyxxxxxx yyxxxxxx * * 000uuuuu 5u4z6y6x 110110ww 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx * zzzzyyyy wwzzzzyy * yyxxxxxx 110111yy * yyxxxxxx (where wwww = uuuuu - 1) * * * validation of utf-8 * * from to utf-8[0] utf-8[1] utf-8[2] utf-8[3] * ------ ------ -------- -------- -------- -------- * 0x0000 0x007F 00..7F * 0x0080 0x07FF C2..DF 80..BF * 0x0800 0x0FFF E0 A0..BF 80..BF * 0x1000 0xCFFF E1..EC 80..BF 80..BF * 0xD000 0xD7FF ED 80..9F 80..BF * 0xE000 0xFFFF EE..EF 80..BF 80..BF * 0x10000 0x3FFFF F0 90..BF 80..BF 80..BF * 0x40000 0xFFFFF F1..F3 80..BF 80..BF 80..BF * 0x100000 0x10FFFF F4 80..8F 80..BF 80..BF * **/ #define __AVAILABLE_CODE_POINTS 0x110000 int __stdcall __ntapi_uc_get_code_point_byte_count_utf8(uint32_t code_point) { /* try clearing 7x bits */ if ((code_point >> 7) == 0) return 1; /* try clearing 5y + 6x bits */ else if ((code_point >> 11) == 0) return 2; /* try clearing 4z +6y + 6x bits */ else if ((code_point >> 16) == 0) return 3; /* try clearing 5u + 4z + 6y + 6x bits */ else if ((code_point >> 21) == 0) return 4; /* __AVAILABLE_CODE_POINTS exceeded */ else return 0; } int __stdcall __ntapi_uc_get_code_point_byte_count_utf16(uint32_t code_point) { /* try clearing 4z +6y + 6x bits */ if ((code_point >> 16) == 0) return 2; /* try clearing 5u + 4z + 6y + 6x bits */ else if ((code_point >> 21) == 0) return 4; /* __AVAILABLE_CODE_POINTS exceeded */ else return 0; } /** * following is a straight-forward implementation * of unicode conversion and validation (see also: * Table 3-7 of the Unicode Standard, version 6.2). * * the use of callbacks allows the validation * functions to be the basis of our utf-8 conversion * functions on the one hand, and the posix path arg * normalization routine on the other. **/ static int32_t __fastcall __default_callback_fn_utf8(nt_utf8_callback_args * args) { args->src += args->byte_count; return NT_STATUS_SUCCESS; } int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf8( __in const unsigned char * ch, __in size_t size_in_bytes __optional, __out size_t * code_points __optional, __out const unsigned char ** addr_failed __optional, __in ntapi_uc_utf8_callback_fn ** callback_fn __optional, __in nt_utf8_callback_args * callback_args __optional) { int32_t status; const unsigned char * utf8; unsigned char * ch_boundary; unsigned char byte_count; size_t _code_points; ntapi_uc_utf8_callback_fn * _callback_fn[5]; nt_utf8_callback_args _callback_args; if (!callback_fn) { _callback_fn[0] = __default_callback_fn_utf8; _callback_fn[1] = __default_callback_fn_utf8; _callback_fn[2] = __default_callback_fn_utf8; _callback_fn[3] = __default_callback_fn_utf8; _callback_fn[4] = __default_callback_fn_utf8; callback_fn = (ntapi_uc_utf8_callback_fn **)&_callback_fn; } if (!callback_args) { callback_args = &_callback_args; callback_args->src = (unsigned char *)0; } if (callback_args->src) ch = callback_args->src; else callback_args->src = ch; if (size_in_bytes) ch_boundary = (unsigned char *)((uintptr_t)ch + size_in_bytes); else ch_boundary = (unsigned char *)(~0); if (!code_points) code_points = &_code_points; while ((ch < ch_boundary) && (*ch)) { utf8 = ch; byte_count = 0; /* try one byte */ if (utf8[0] <= 0x7F) byte_count = 1; /* try two bytes */ else if ((++ch < ch_boundary) && (utf8[0] >= 0xC2) && (utf8[0] <= 0xDF) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) byte_count = 2; /* try three bytes */ else if ((++ch < ch_boundary) && (utf8[0] == 0xE0) && (utf8[1] >= 0xA0) && (utf8[1] <= 0xBF) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) byte_count = 3; else if ( (utf8[0] >= 0xE1) && (utf8[0] <= 0xEC) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) byte_count = 3; else if ( (utf8[0] == 0xED) && (utf8[1] >= 0x80) && (utf8[1] <= 0x9F) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) byte_count = 3; else if ( (utf8[0] >= 0xEE) && (utf8[0] <= 0xEF) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) byte_count = 3; /* try four bytes */ else if ((++ch < ch_boundary) && (utf8[0] == 0xF0) && (utf8[1] >= 0x90) && (utf8[1] <= 0xBF) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) byte_count = 4; else if ( (utf8[0] >= 0xF1) && (utf8[0] <= 0xF3) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) byte_count = 4; else if ( (utf8[0] == 0xF4) && (utf8[1] >= 0x80) && (utf8[1] <= 0x8F) && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) byte_count = 4; if (byte_count) { (*code_points)++; callback_args->byte_count = byte_count; if ((status = callback_fn[byte_count](callback_args))) return status; } else { if (addr_failed) *addr_failed = (void *)utf8; return NT_STATUS_ILLEGAL_CHARACTER; } /* advance, transcode if needed */ ch = callback_args->src; } if ((ch < ch_boundary) && (*ch == 0)) return callback_fn[0](callback_args); return NT_STATUS_SUCCESS; } static int32_t __fastcall __default_callback_fn_utf16(nt_utf16_callback_args * args) { if (args->byte_count == 4) args->src += 2; else args->src++; return NT_STATUS_SUCCESS; } int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf16( __in const wchar16_t * wch, __in size_t size_in_bytes __optional, __out size_t * code_points __optional, __out const wchar16_t ** addr_failed __optional, __in ntapi_uc_utf16_callback_fn ** callback_fn __optional, __in nt_utf16_callback_args * callback_args __optional) { int32_t status; const wchar16_t * wch_trail; wchar16_t * wch_boundary; unsigned char byte_count; size_t _code_points; ntapi_uc_utf16_callback_fn * _callback_fn[5]; nt_utf16_callback_args _callback_args; if (!callback_fn) { _callback_fn[0] = __default_callback_fn_utf16; _callback_fn[1] = __default_callback_fn_utf16; _callback_fn[2] = __default_callback_fn_utf16; _callback_fn[3] = __default_callback_fn_utf16; _callback_fn[4] = __default_callback_fn_utf16; callback_fn = (ntapi_uc_utf16_callback_fn **)&_callback_fn; } if (!callback_args) { callback_args = &_callback_args; callback_args->src = (wchar16_t *)0; } if (callback_args->src) wch = callback_args->src; else callback_args->src = wch; if (size_in_bytes) wch_boundary = (wchar16_t *)((uintptr_t)wch + size_in_bytes); else wch_boundary = (wchar16_t *)(~0); if (!code_points) code_points = &_code_points; while ((wch < wch_boundary) && (*wch)) { byte_count = 0; /* try one byte */ if (*wch <= 0x7F) byte_count = 1; /* try two bytes */ else if (*wch <= 0x7FF) byte_count = 2; /* try three bytes */ else if ((*wch < 0xD800) || (*wch >= 0xE000)) byte_count = 3; /* try four bytes */ else if ((*wch >= 0xD800) && (*wch < 0xDC00)) { wch_trail = wch + 1; if ((wch_trail < wch_boundary) && (*wch_trail >= 0xDC00) && (*wch_trail < 0xE000)) byte_count = 4; } if (byte_count) { (*code_points)++; callback_args->byte_count = byte_count; if ((status = callback_fn[byte_count](callback_args))) return status; } else { if (addr_failed) *addr_failed = (void *)wch; return NT_STATUS_ILLEGAL_CHARACTER; } /* advance, transcode as needed */ wch = callback_args->src; } if ((wch < wch_boundary) && (*wch == 0)) return callback_fn[0](callback_args); return NT_STATUS_SUCCESS; }