From dd89bb8ad4fe184a34b5dbdda237e640fc82121b Mon Sep 17 00:00:00 2001 From: midipix Date: Mon, 27 Jul 2015 04:01:18 -0400 Subject: entered advanced internal development stage. --- .../ntapi_uc_unicode_conversion_from_utf16.c | 287 ++++++++++++++++++ .../ntapi_uc_unicode_conversion_from_utf8.c | 288 ++++++++++++++++++ src/unicode/ntapi_uc_unicode_validation.c | 329 +++++++++++++++++++++ 3 files changed, 904 insertions(+) create mode 100644 src/unicode/ntapi_uc_unicode_conversion_from_utf16.c create mode 100644 src/unicode/ntapi_uc_unicode_conversion_from_utf8.c create mode 100644 src/unicode/ntapi_uc_unicode_validation.c (limited to 'src/unicode') diff --git a/src/unicode/ntapi_uc_unicode_conversion_from_utf16.c b/src/unicode/ntapi_uc_unicode_conversion_from_utf16.c new file mode 100644 index 0000000..102a24d --- /dev/null +++ b/src/unicode/ntapi_uc_unicode_conversion_from_utf16.c @@ -0,0 +1,287 @@ +/********************************************************/ +/* ntapi: Native API core library */ +/* Copyright (C) 2013,2014,2015 Z. Gilboa */ +/* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */ +/********************************************************/ + +#include +#include +#include +#include "ntapi_impl.h" + + +static int32_t __fastcall __utf16_to_utf8_handler_1byte_or_null_termination(nt_utf16_callback_args * args) +{ + /*******************************************/ + /* from: 00000000 0xxxxxxx (little endian) */ + /* to: 0xxxxxxx (utf-8) */ + /*******************************************/ + + uint8_t * dst; + + if (args->dst >= args->dst_cap) + return NT_STATUS_BUFFER_TOO_SMALL; + + dst = (uint8_t *)args->dst; + *dst = *(uint8_t *)(args->src); + + /* advance source and destination buffer */ + args->src++; + args->dst = (void *)((uintptr_t)(args->dst) + 1); + + /* bytes_written */ + args->bytes_written++; + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __utf16_to_utf8_handler_2bytes(nt_utf16_callback_args * args) +{ + /*******************************************/ + /* from: 00000yyy yyxxxxxx (little endian) */ + /* to: 110yyyyy 10xxxxxx (utf-8) */ + /*******************************************/ + + const wchar16_t * src; + uint8_t * dst; + + wchar16_t wx; + wchar16_t wy; + + if ((uintptr_t)(args->dst) + 1 >= (uintptr_t)(args->dst_cap)) + return NT_STATUS_BUFFER_TOO_SMALL; + + src = args->src; + dst = (uint8_t *)args->dst; + + wy = *src; + wy >>= 6; + + wx = *src; + wx <<= 10; + wx >>= 10; + + /* write the y part */ + *dst = (char)(0xC0 | wy); + dst++; + + /* write the x part */ + *dst = (char)(0x80 | wx); + + /* advance source and destination buffer */ + args->src++; + args->dst = (void *)((uintptr_t)(args->dst) + 2); + + /* bytes_written */ + args->bytes_written += 2; + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __utf16_to_utf8_handler_3bytes(nt_utf16_callback_args * args) +{ + /********************************************/ + /* from: zzzzyyyy yyxxxxxx (little endian) */ + /* to: 1110zzzz 10yyyyyy 10xxxxxx (utf-8) */ + /********************************************/ + + const wchar16_t * src; + uint8_t * dst; + + wchar16_t wx; + wchar16_t wy; + wchar16_t wz; + + if ((uintptr_t)(args->dst) + 2 >= (uintptr_t)(args->dst_cap)) + return NT_STATUS_BUFFER_TOO_SMALL; + + src = args->src; + dst = (uint8_t *)args->dst; + + wz = *src; + wz >>= 12; + + wy = *src; + wy <<= 4; + wy >>= 10; + + wx = *src; + wx <<= 10; + wx >>= 10; + + /* write the z part */ + *dst = (char)(0xE0 | wz); + dst++; + + /* write the y part */ + *dst = (char)(0x80 | wy); + dst++; + + /* write the x part */ + *dst = (char)(0x80 | wx); + + /* advance source and destination buffer */ + args->src++; + args->dst = (void *)((uintptr_t)(args->dst) + 3); + + /* bytes_written */ + args->bytes_written += 3; + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __utf16_to_utf8_handler_4bytes(nt_utf16_callback_args * args) +{ + /****************************************************************/ + /* from: 110110ww wwzzzzyy 110111yy yyxxxxxx (little endian) */ + /* to: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx (utf-8) */ + /****************************************************************/ + + const wchar16_t * src; + uint8_t * dst; + + wchar16_t wx; + wchar16_t wz; + + wchar16_t wy_low; + wchar16_t wy_high; + wchar16_t ww; + wchar16_t uuuuu; + wchar16_t u_low; + wchar16_t u_high; + + if ((uintptr_t)(args->dst) + 3 >= (uintptr_t)(args->dst_cap)) + return NT_STATUS_BUFFER_TOO_SMALL; + + src = args->src; + dst = (uint8_t *)args->dst; + + /* low two bytes */ + wx = *src; + wx <<= 10; + wx >>= 10; + + wy_low = *src; + wy_low <<= 6; + wy_low >>= 12; + + /* (surrogate pair) */ + src++; + + /* high two bytes */ + wy_high = *src; + wy_high <<= 14; + wy_high >>= 10; + + wz = *src; + wz <<= 10; + wz >>= 12; + wz <<= 2; + + ww = *src; + ww <<= 6; + ww >>= 12; + + uuuuu = ww + 1; + u_high = uuuuu >> 2; + u_low = ((uuuuu << 14) >> 10); + + /* 1st byte: 11110uuu */ + *dst = (char)(0xF0 | u_high); + dst++; + + /* 2nd byte: 10uuzzzz */ + *dst = (char)(0x80 | u_low | wz); + dst++; + + /* 3rd byte: 10yyyyyy */ + *dst = (char)(0x80 | wy_low | wy_high); + dst++; + + /* 4th byte: 10xxxxxx */ + *dst = (char)(0x80 | wx); + + /* advance source and destination buffer */ + args->src += 2; + args->dst = (void *)((uintptr_t)(args->dst) + 4); + + /* bytes_written */ + args->bytes_written += 4; + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __update_stream_leftover_info_utf16( + __in_out nt_unicode_conversion_params_utf16_to_utf8 * params) +{ + int32_t status; + ptrdiff_t offset; + wchar16_t * wlead; + + offset = (uintptr_t)params->src + (uintptr_t)params->src_size_in_bytes - (uintptr_t)params->addr_failed; + wlead = (wchar16_t *)params->addr_failed; + + + if ((offset == 2) && (*wlead >= 0xD800) && (*wlead < 0xDC00)) { + /* possibly the lead of a surrogate pair lead */ + params->leftover_count = 2; + params->leftover_bytes = *wlead; + params->leftover_bytes <<= 16; + status = NT_STATUS_SUCCESS; + } else { + params->leftover_count = 0; + params->leftover_bytes = 0; + status = NT_STATUS_ILLEGAL_CHARACTER; + } + + return status; +} + + +int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf16_to_utf8( + __in_out nt_unicode_conversion_params_utf16_to_utf8 * params) +{ + int32_t status; + nt_utf16_callback_args args; + ntapi_uc_utf16_callback_fn * callback_fn[5]; + + callback_fn[0] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_1byte_or_null_termination; + callback_fn[1] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_1byte_or_null_termination; + callback_fn[2] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_2bytes; + callback_fn[3] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_3bytes; + callback_fn[4] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_4bytes; + + args.src = params->src; + args.dst = params->dst; + args.dst_cap = (void *)((uintptr_t)(params->dst) + (params->dst_size_in_bytes)); + args.bytes_written = params->bytes_written; + + status = __ntapi_uc_validate_unicode_stream_utf16( + params->src, + params->src_size_in_bytes, + ¶ms->code_points, + ¶ms->addr_failed, + callback_fn, + &args); + + params->bytes_written = args.bytes_written; + + if (status) + status = __update_stream_leftover_info_utf16(params); + + /* the following bit shift will be optimized out on 32-bit architectures */ + params->leftover_bytes <<= (8 * (sizeof(uintptr_t) - sizeof(uint32_t))); + + return status; +} + + +int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf16_to_utf32( + __in_out nt_unicode_conversion_params_utf16_to_utf32 * params) +{ + return NT_STATUS_SUCCESS; +} diff --git a/src/unicode/ntapi_uc_unicode_conversion_from_utf8.c b/src/unicode/ntapi_uc_unicode_conversion_from_utf8.c new file mode 100644 index 0000000..02976ea --- /dev/null +++ b/src/unicode/ntapi_uc_unicode_conversion_from_utf8.c @@ -0,0 +1,288 @@ +/********************************************************/ +/* ntapi: Native API core library */ +/* Copyright (C) 2013,2014,2015 Z. Gilboa */ +/* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */ +/********************************************************/ + +#include +#include +#include +#include "ntapi_impl.h" + + +typedef struct ___two_bytes { + unsigned char low; + unsigned char high; +} __two_bytes; + + +typedef struct ___three_bytes { + unsigned char low; + unsigned char middle; + unsigned char high; +} __three_bytes; + + +static int32_t __fastcall __utf8_to_utf16_handler_1byte_or_null_termination(nt_utf8_callback_args * args) +{ + /***************************/ + /* from: 0xxxxxxx */ + /* to: 00000000 0xxxxxxx */ + /***************************/ + + wchar16_t * dst; + + if (args->dst >= args->dst_cap) + return NT_STATUS_BUFFER_TOO_SMALL; + + dst = (wchar16_t *)args->dst; + *dst = *(args->src); + + /* advance source and destination buffer */ + args->src++; + args->dst = (void *)((uintptr_t)(args->dst) + sizeof(wchar16_t)); + + /* bytes_written */ + args->bytes_written += sizeof(wchar16_t); + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __utf8_to_utf16_handler_2bytes(nt_utf8_callback_args * args) +{ + /***************************/ + /* from: 110yyyyy 10xxxxxx */ + /* to: 00000yyy yyxxxxxx */ + /***************************/ + + __two_bytes * src; /* big endian */ + wchar16_t * dst; + + if (args->dst >= args->dst_cap) + return NT_STATUS_BUFFER_TOO_SMALL; + + src = (__two_bytes *)args->src; + dst = (wchar16_t *)args->dst; + + /* yyyyy */ + *dst = (src->low ^ 0xC0); + *dst <<= 6; + + /* xxxxxx */ + *dst |= (src->high ^ 0x80); + + /* advance source and destination buffer */ + args->src += 2; + args->dst = (void *)((uintptr_t)(args->dst) + sizeof(wchar16_t)); + + /* bytes_written */ + args->bytes_written += sizeof(wchar16_t); + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __utf8_to_utf16_handler_3bytes(nt_utf8_callback_args * args) +{ + /************************************/ + /* from: 1110zzzz 10yyyyyy 10xxxxxx */ + /* to: zzzzyyyy yyxxxxxx */ + /************************************/ + + __three_bytes * src; /* big endian */ + wchar16_t * dst; + wchar16_t yyyyy; + + if (args->dst >= args->dst_cap) + return NT_STATUS_BUFFER_TOO_SMALL; + + src = (__three_bytes *)args->src; + dst = (wchar16_t *)args->dst; + + /* zzzz */ + *dst = (src->low ^ 0xE0); + *dst <<= 12; + + /* yyyyy */ + yyyyy = (src->middle ^ 0x80); + yyyyy <<= 6; + *dst |= yyyyy; + + /* xxxxxx */ + *dst |= (src->high ^ 0x80); + + /* advance source and destination buffer */ + args->src += 3; + args->dst = (void *)((uintptr_t)(args->dst) + sizeof(wchar16_t)); + + /* bytes_written */ + args->bytes_written += sizeof(wchar16_t); + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __utf8_to_utf16_handler_4bytes(nt_utf8_callback_args * args) +{ + /*************************************************/ + /* from: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx */ + /* to: 110110ww wwzzzzyy 110111yy yyxxxxxx */ + /*************************************************/ + + __two_bytes * src_low; /* big endian */ + __two_bytes * src_high; /* big endian */ + wchar16_t * dst_lead; + wchar16_t * dst_trail; + + wchar16_t u; + unsigned char ulow; + unsigned char uhigh; + unsigned char yyyy; + + dst_lead = dst_trail = (wchar16_t *)args->dst; + dst_trail++; + + if ((uintptr_t)dst_trail >= (uintptr_t)args->dst_cap) + return NT_STATUS_BUFFER_TOO_SMALL; + + src_low = src_high = (__two_bytes *)args->src; + src_high++; + + /* u */ + ulow = src_low->low ^ 0xF0; + uhigh = src_low->high ^ 0x80; + + ulow <<= 2; + uhigh >>= 4; + + u = ulow | uhigh; + + /* 110110ww wwzzzzyy */ + *dst_lead = 0xD800; + *dst_lead |= ((u-1) << 6); + *dst_lead |= ((src_low->high ^ 0x80) << 2); + *dst_lead |= ((src_high->low ^ 0x80) >> 4); + + /* 110111yy yyxxxxxx */ + yyyy = (src_high->low << 4); + *dst_trail = yyyy; + *dst_trail <<= 2; + *dst_trail |= (src_high->high ^ 0x80); + *dst_trail |= 0xDC00; + + /* advance source and destination buffer */ + args->src += 4; + args->dst = (void *)((uintptr_t)(args->dst) + (2 * sizeof(wchar16_t))); + + /* bytes_written */ + args->bytes_written += 2 * sizeof(wchar16_t); + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __update_stream_leftover_info_utf8( + __in_out nt_unicode_conversion_params_utf8_to_utf16 * params) +{ + int32_t status; + ptrdiff_t offset; + unsigned char * utf8; + + offset = (uintptr_t)params->src + (uintptr_t)params->src_size_in_bytes - (uintptr_t)params->addr_failed; + utf8 = (unsigned char *)params->addr_failed; + + /* default status */ + status = NT_STATUS_ILLEGAL_CHARACTER; + + if (offset == 1) { + if ((utf8[0] >= 0xC2) && (utf8[0] <= 0xF4)) { + /* one leftover byte */ + params->leftover_count = 1; + params->leftover_bytes = utf8[0]; + params->leftover_bytes <<= 24; + status = NT_STATUS_SUCCESS; + } + } else if (offset == 2) { + if /* ------- */ (((utf8[0] == 0xE0) && (utf8[1] >= 0xA0) && (utf8[1] <= 0xBF)) + || ((utf8[0] >= 0xE1) && (utf8[0] <= 0xEC) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) + || ((utf8[0] == 0xED) && (utf8[1] >= 0x80) && (utf8[1] <= 0x9F)) + || ((utf8[0] >= 0xEE) && (utf8[0] <= 0xEF) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) + || ((utf8[0] == 0xF0) && (utf8[1] >= 0x90) && (utf8[1] <= 0xBF)) + || ((utf8[0] >= 0xF1) && (utf8[0] <= 0xF3) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) + || ((utf8[0] == 0xF4) && (utf8[1] >= 0x80) && (utf8[1] <= 0x8F))) { + /* two leftover bytes */ + params->leftover_count = 2; + params->leftover_bytes = utf8[0]; + params->leftover_bytes <<= 8; + params->leftover_bytes += utf8[1]; + params->leftover_bytes <<= 16; + status = NT_STATUS_SUCCESS; + } + } else if (offset == 3) { + if /* ------- */ (((utf8[0] == 0xF0) && (utf8[1] >= 0x90) && (utf8[1] <= 0xBF)) + || ((utf8[0] >= 0xF1) && (utf8[0] <= 0xF3) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) + || ((utf8[0] == 0xF4) && (utf8[1] >= 0x80) && (utf8[1] <= 0x8F))) { + /* three leftover bytes */ + params->leftover_count = 3; + params->leftover_bytes = utf8[0]; + params->leftover_bytes <<= 8; + params->leftover_bytes += utf8[1]; + params->leftover_bytes <<= 8; + params->leftover_bytes += utf8[2]; + params->leftover_bytes <<= 8; + status = NT_STATUS_SUCCESS; + } + } + + if (status != NT_STATUS_SUCCESS) { + params->leftover_count = 0; + params->leftover_bytes = 0; + } + + return status; +} + +int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf8_to_utf16( + __in_out nt_unicode_conversion_params_utf8_to_utf16 * params) +{ + int32_t status; + nt_utf8_callback_args args; + ntapi_uc_utf8_callback_fn * callback_fn[5]; + + callback_fn[0] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_1byte_or_null_termination; + callback_fn[1] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_1byte_or_null_termination; + callback_fn[2] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_2bytes; + callback_fn[3] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_3bytes; + callback_fn[4] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_4bytes; + + args.src = params->src; + args.dst = params->dst; + args.dst_cap = (void *)((uintptr_t)(params->dst) + (params->dst_size_in_bytes)); + args.bytes_written = params->bytes_written; + + status = __ntapi_uc_validate_unicode_stream_utf8( + params->src, + params->src_size_in_bytes, + ¶ms->code_points, + ¶ms->addr_failed, + callback_fn, + &args); + + params->bytes_written = args.bytes_written; + + if (status != NT_STATUS_SUCCESS) + status = __update_stream_leftover_info_utf8(params); + + /* (optimized out on 32-bit architectures) */ + params->leftover_bytes <<= (8 * (sizeof(uintptr_t) - sizeof(uint32_t))); + + return status; +} + + +int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf8_to_utf32( + __in_out nt_unicode_conversion_params_utf8_to_utf32 * params) +{ + return NT_STATUS_SUCCESS; +} diff --git a/src/unicode/ntapi_uc_unicode_validation.c b/src/unicode/ntapi_uc_unicode_validation.c new file mode 100644 index 0000000..4c6fcac --- /dev/null +++ b/src/unicode/ntapi_uc_unicode_validation.c @@ -0,0 +1,329 @@ +/********************************************************/ +/* ntapi: Native API core library */ +/* Copyright (C) 2013,2014,2015 Z. Gilboa */ +/* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */ +/********************************************************/ + +#include +#include +#include + +/** + * unofficial bit distribution table for comprehension purposes only + * + * scalar nickname utf-16 utf-8[0] utf-8[1] utf-8[2] utf-8[3] + * ------ -------- -------- -------- -------- -------- -------- + * 00000000 7x 00000000 0xxxxxxx + * 0xxxxxxx 0xxxxxxx + * + * 00000yyy 5y6x 00000yyy 110yyyyy 10xxxxxx + * yyxxxxxx yyxxxxxx + * + * zzzzyyyy 4z6y6x zzzzyyyy 1110zzzz 10yyyyyy 10xxxxxx + * yyxxxxxx yyxxxxxx + * + * 000uuuuu 5u4z6y6x 110110ww 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx + * zzzzyyyy wwzzzzyy + * yyxxxxxx 110111yy + * yyxxxxxx (where wwww = uuuuu - 1) + * + * + * validation of utf-8 + * + * from to utf-8[0] utf-8[1] utf-8[2] utf-8[3] + * ------ ------ -------- -------- -------- -------- + * 0x0000 0x007F 00..7F + * 0x0080 0x07FF C2..DF 80..BF + * 0x0800 0x0FFF E0 A0..BF 80..BF + * 0x1000 0xCFFF E1..EC 80..BF 80..BF + * 0xD000 0xD7FF ED 80..9F 80..BF + * 0xE000 0xFFFF EE..EF 80..BF 80..BF + * 0x10000 0x3FFFF F0 90..BF 80..BF 80..BF + * 0x40000 0xFFFFF F1..F3 80..BF 80..BF 80..BF + * 0x100000 0x10FFFF F4 80..8F 80..BF 80..BF + * +**/ + + +#define __AVAILABLE_CODE_POINTS 0x110000 + +int __stdcall __ntapi_uc_get_code_point_byte_count_utf8(uint32_t code_point) +{ + /* try clearing 7x bits */ + if ((code_point >> 7) == 0) + return 1; + + /* try clearing 5y + 6x bits */ + else if ((code_point >> 11) == 0) + return 2; + + /* try clearing 4z +6y + 6x bits */ + else if ((code_point >> 16) == 0) + return 3; + + /* try clearing 5u + 4z + 6y + 6x bits */ + else if ((code_point >> 21) == 0) + return 4; + + /* __AVAILABLE_CODE_POINTS exceeded */ + else + return 0; +} + + +int __stdcall __ntapi_uc_get_code_point_byte_count_utf16(uint32_t code_point) +{ + /* try clearing 4z +6y + 6x bits */ + if ((code_point >> 16) == 0) + return 2; + + /* try clearing 5u + 4z + 6y + 6x bits */ + else if ((code_point >> 21) == 0) + return 4; + + /* __AVAILABLE_CODE_POINTS exceeded */ + else + return 0; +} + + +/** + * following is a straight-forward implementation + * of unicode conversion and validation (see also: + * Table 3-7 of the Unicode Standard, version 6.2). + * + * the use of callbacks allows the validation + * functions to be the basis of our utf-8 conversion + * functions on the one hand, and the posix path arg + * normalization routine on the other. +**/ + +static int32_t __fastcall __default_callback_fn_utf8(nt_utf8_callback_args * args) +{ + args->src += args->byte_count; + return NT_STATUS_SUCCESS; +} + +int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf8( + __in const unsigned char * ch, + __in size_t size_in_bytes __optional, + __out size_t * code_points __optional, + __out void ** addr_failed __optional, + __in ntapi_uc_utf8_callback_fn ** callback_fn __optional, + __in nt_utf8_callback_args * callback_args __optional) +{ + const unsigned char * utf8; + unsigned char * ch_boundary; + unsigned char byte_count; + size_t _code_points; + + ntapi_uc_utf8_callback_fn * _callback_fn[5]; + nt_utf8_callback_args _callback_args; + + if (!callback_fn) { + _callback_fn[0] = __default_callback_fn_utf8; + _callback_fn[1] = __default_callback_fn_utf8; + _callback_fn[2] = __default_callback_fn_utf8; + _callback_fn[3] = __default_callback_fn_utf8; + _callback_fn[4] = __default_callback_fn_utf8; + callback_fn = (ntapi_uc_utf8_callback_fn **)&_callback_fn; + } + + if (!callback_args) { + callback_args = &_callback_args; + callback_args->src = (unsigned char *)0; + } + + if (callback_args->src) + ch = callback_args->src; + else + callback_args->src = ch; + + if (size_in_bytes) + ch_boundary = (unsigned char *)((uintptr_t)ch + size_in_bytes); + else + ch_boundary = (unsigned char *)(~0); + + if (!code_points) + code_points = &_code_points; + + while ((ch < ch_boundary) && (*ch)) { + utf8 = ch; + byte_count = 0; + + /* try one byte */ + if (utf8[0] <= 0x7F) + byte_count = 1; + + /* try two bytes */ + else if ((++ch < ch_boundary) + && (utf8[0] >= 0xC2) && (utf8[0] <= 0xDF) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) + byte_count = 2; + + /* try three bytes */ + else if ((++ch < ch_boundary) + && (utf8[0] == 0xE0) + && (utf8[1] >= 0xA0) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + else if ( + (utf8[0] >= 0xE1) && (utf8[0] <= 0xEC) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + else if ( + (utf8[0] == 0xED) + && (utf8[1] >= 0x80) && (utf8[1] <= 0x9F) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + else if ( + (utf8[0] >= 0xEE) && (utf8[0] <= 0xEF) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + /* try four bytes */ + else if ((++ch < ch_boundary) + && (utf8[0] == 0xF0) + && (utf8[1] >= 0x90) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) + && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) + byte_count = 4; + + else if ( + (utf8[0] >= 0xF1) && (utf8[0] <= 0xF3) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) + && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) + byte_count = 4; + + else if ( + (utf8[0] == 0xF4) + && (utf8[1] >= 0x80) && (utf8[1] <= 0x8F) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) + && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) + byte_count = 4; + + if (byte_count) { + (*code_points)++; + callback_args->byte_count = byte_count; + callback_fn[byte_count](callback_args); + } else { + if (addr_failed) + *addr_failed = (void *)utf8; + return NT_STATUS_ILLEGAL_CHARACTER; + } + + /* advance, transcode if needed */ + ch = callback_args->src; + } + + if ((ch < ch_boundary) && (*ch == 0)) + callback_fn[0](callback_args); + + return NT_STATUS_SUCCESS; +} + + +static int32_t __fastcall __default_callback_fn_utf16(nt_utf16_callback_args * args) +{ + if (args->byte_count == 4) + args->src += 2; + else + args->src++; + + return NT_STATUS_SUCCESS; +} + + +int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf16( + __in const wchar16_t * wch, + __in size_t size_in_bytes __optional, + __out size_t * code_points __optional, + __out void ** addr_failed __optional, + __in ntapi_uc_utf16_callback_fn ** callback_fn __optional, + __in nt_utf16_callback_args * callback_args __optional) +{ + const wchar16_t * wch_trail; + wchar16_t * wch_boundary; + unsigned char byte_count; + size_t _code_points; + + ntapi_uc_utf16_callback_fn * _callback_fn[5]; + nt_utf16_callback_args _callback_args; + + if (!callback_fn) { + _callback_fn[0] = __default_callback_fn_utf16; + _callback_fn[1] = __default_callback_fn_utf16; + _callback_fn[2] = __default_callback_fn_utf16; + _callback_fn[3] = __default_callback_fn_utf16; + _callback_fn[4] = __default_callback_fn_utf16; + callback_fn = (ntapi_uc_utf16_callback_fn **)&_callback_fn; + } + + if (!callback_args) { + callback_args = &_callback_args; + callback_args->src = (wchar16_t *)0; + } + + if (callback_args->src) + wch = callback_args->src; + else + callback_args->src = wch; + + if (size_in_bytes) + wch_boundary = (wchar16_t *)((uintptr_t)wch + size_in_bytes); + else + wch_boundary = (wchar16_t *)(~0); + + if (!code_points) + code_points = &_code_points; + + while ((wch < wch_boundary) && (*wch)) { + byte_count = 0; + + /* try one byte */ + if (*wch <= 0x7F) + byte_count = 1; + + /* try two bytes */ + else if (*wch <= 0x7FF) + byte_count = 2; + + /* try three bytes */ + else if ((*wch < 0xD800) || (*wch >= 0xE000)) + byte_count = 3; + + /* try four bytes */ + else if ((*wch >= 0xD800) && (*wch < 0xDC00)) { + wch_trail = wch + 1; + + if ((wch_trail < wch_boundary) + && (*wch_trail >= 0xDC00) + && (*wch_trail < 0xE000)) + byte_count = 4; + } + + if (byte_count) { + (*code_points)++; + callback_args->byte_count = byte_count; + callback_fn[byte_count](callback_args); + } else { + if (addr_failed) + *addr_failed = (void *)wch; + return NT_STATUS_ILLEGAL_CHARACTER; + } + + /* advance, transcode as needed */ + wch = callback_args->src; + } + + if ((wch < wch_boundary) && (*wch == 0)) + callback_fn[0](callback_args); + + return NT_STATUS_SUCCESS; +} -- cgit v1.2.3