summaryrefslogtreecommitdiffhomepage
path: root/src/unicode
diff options
context:
space:
mode:
authormidipix <writeonce@midipix.org>2015-07-27 04:01:18 -0400
committermidipix <writeonce@midipix.org>2015-07-27 04:01:18 -0400
commitdd89bb8ad4fe184a34b5dbdda237e640fc82121b (patch)
tree5e80d2da35f5892f92be29f57982b2708e6bd99b /src/unicode
parentdcdadc2702712fa750ed255ed1dfa354522797a0 (diff)
downloadntapi-dd89bb8ad4fe184a34b5dbdda237e640fc82121b.tar.bz2
ntapi-dd89bb8ad4fe184a34b5dbdda237e640fc82121b.tar.xz
entered advanced internal development stage.
Diffstat (limited to 'src/unicode')
-rw-r--r--src/unicode/ntapi_uc_unicode_conversion_from_utf16.c287
-rw-r--r--src/unicode/ntapi_uc_unicode_conversion_from_utf8.c288
-rw-r--r--src/unicode/ntapi_uc_unicode_validation.c329
3 files changed, 904 insertions, 0 deletions
diff --git a/src/unicode/ntapi_uc_unicode_conversion_from_utf16.c b/src/unicode/ntapi_uc_unicode_conversion_from_utf16.c
new file mode 100644
index 0000000..102a24d
--- /dev/null
+++ b/src/unicode/ntapi_uc_unicode_conversion_from_utf16.c
@@ -0,0 +1,287 @@
+/********************************************************/
+/* ntapi: Native API core library */
+/* Copyright (C) 2013,2014,2015 Z. Gilboa */
+/* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */
+/********************************************************/
+
+#include <psxtypes/psxtypes.h>
+#include <ntapi/nt_status.h>
+#include <ntapi/nt_unicode.h>
+#include "ntapi_impl.h"
+
+
+static int32_t __fastcall __utf16_to_utf8_handler_1byte_or_null_termination(nt_utf16_callback_args * args)
+{
+	/*******************************************/
+	/* from: 00000000 0xxxxxxx (little endian) */
+	/* to:   0xxxxxxx          (utf-8)         */
+	/*******************************************/
+
+	/**
+	 * emits one utf-8 byte for the current utf-16 unit; the
+	 * unit is expected to be ascii or the null terminator
+	 * (this is not verified here).
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, when dst has already reached dst_cap.
+	**/
+
+	uint8_t *	out;
+
+	if (args->dst >= args->dst_cap)
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	/* only the first byte in memory of the unit is copied */
+	out    = (uint8_t *)args->dst;
+	out[0] = *(uint8_t *)(args->src);
+
+	/* one unit consumed, one byte produced */
+	args->src += 1;
+	args->dst  = (void *)(out + 1);
+	args->bytes_written += 1;
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __utf16_to_utf8_handler_2bytes(nt_utf16_callback_args * args)
+{
+	/*******************************************/
+	/* from: 00000yyy yyxxxxxx (little endian) */
+	/* to:   110yyyyy 10xxxxxx (utf-8)         */
+	/*******************************************/
+
+	/**
+	 * emits the two-byte utf-8 form of the current utf-16
+	 * unit, then advances the source by one unit and the
+	 * destination by two bytes.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless both bytes fit below dst_cap.
+	**/
+
+	uint8_t *	out;
+	wchar16_t	unit;
+	wchar16_t	y_bits;
+	wchar16_t	x_bits;
+
+	if ((uintptr_t)(args->dst) + 1 >= (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	unit = *args->src;
+	out  = (uint8_t *)args->dst;
+
+	/* everything above the low six bits goes to the lead byte */
+	y_bits = unit >> 6;
+
+	/* the low six bits go to the continuation byte */
+	x_bits = unit & 0x3F;
+
+	out[0] = (char)(0xC0 | y_bits);
+	out[1] = (char)(0x80 | x_bits);
+
+	/* one unit consumed, two bytes produced */
+	args->src += 1;
+	args->dst  = (void *)(out + 2);
+	args->bytes_written += 2;
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __utf16_to_utf8_handler_3bytes(nt_utf16_callback_args * args)
+{
+	/********************************************/
+	/* from: zzzzyyyy yyxxxxxx (little endian)  */
+	/* to:   1110zzzz 10yyyyyy 10xxxxxx (utf-8) */
+	/********************************************/
+
+	/**
+	 * emits the three-byte utf-8 form of the current utf-16
+	 * unit, then advances the source by one unit and the
+	 * destination by three bytes.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless all three bytes fit below dst_cap.
+	**/
+
+	uint8_t *	out;
+	wchar16_t	unit;
+	wchar16_t	z_bits;
+	wchar16_t	y_bits;
+	wchar16_t	x_bits;
+
+	if ((uintptr_t)(args->dst) + 2 >= (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	unit = *args->src;
+	out  = (uint8_t *)args->dst;
+
+	/* top four bits, middle six bits, low six bits */
+	z_bits = unit >> 12;
+	y_bits = (unit >> 6) & 0x3F;
+	x_bits = unit & 0x3F;
+
+	out[0] = (char)(0xE0 | z_bits);
+	out[1] = (char)(0x80 | y_bits);
+	out[2] = (char)(0x80 | x_bits);
+
+	/* one unit consumed, three bytes produced */
+	args->src += 1;
+	args->dst  = (void *)(out + 3);
+	args->bytes_written += 3;
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __utf16_to_utf8_handler_4bytes(nt_utf16_callback_args * args)
+{
+	/****************************************************************/
+	/* from: 110110ww wwzzzzyy 110111yy yyxxxxxx (lead, then trail) */
+	/* to:   11110uuu 10uuzzzz 10yyyyyy 10xxxxxx (utf-8)            */
+	/*       where uuuuu = wwww + 1                                 */
+	/****************************************************************/
+
+	/**
+	 * emits the four-byte utf-8 form of a surrogate pair, then
+	 * advances the source by two units and the destination by
+	 * four bytes.
+	 *
+	 * the pair is read in stream order: src[0] is the lead
+	 * surrogate (0xD800..0xDBFF) and src[1] is the trail
+	 * surrogate (0xDC00..0xDFFF). the scalar value is rebuilt
+	 * first and then split into utf-8 bit groups, so that no
+	 * bit group can leak into a neighbouring one.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless all four bytes fit below dst_cap.
+	**/
+
+	const wchar16_t *	src;
+	uint8_t *		dst;
+	uint32_t		lead;
+	uint32_t		trail;
+	uint32_t		code_point;
+
+	if ((uintptr_t)(args->dst) + 3 >= (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	src = args->src;
+	dst = (uint8_t *)args->dst;
+
+	/* ten payload bits from each surrogate */
+	lead  = src[0] & 0x3FF;
+	trail = src[1] & 0x3FF;
+
+	/* adding 0x10000 is what turns wwww into uuuuu = wwww + 1 */
+	code_point = 0x10000 + (lead << 10) + trail;
+
+	/* 1st byte: 11110uuu */
+	dst[0] = (uint8_t)(0xF0 | (code_point >> 18));
+
+	/* 2nd byte: 10uuzzzz */
+	dst[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
+
+	/* 3rd byte: 10yyyyyy */
+	dst[2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
+
+	/* 4th byte: 10xxxxxx */
+	dst[3] = (uint8_t)(0x80 | (code_point & 0x3F));
+
+	/* advance source and destination buffer */
+	args->src += 2;
+	args->dst  = (void *)((uintptr_t)(args->dst) + 4);
+
+	/* bytes_written */
+	args->bytes_written += 4;
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __update_stream_leftover_info_utf16(
+	__in_out	nt_unicode_conversion_params_utf16_to_utf8 *	params)
+{
+	/**
+	 * decides whether the unit at params->addr_failed is the
+	 * lead of a surrogate pair that was cut off by the end of
+	 * the source buffer.
+	 *
+	 * if exactly two bytes remain and they hold a lead surrogate,
+	 * the unit is stored in leftover_bytes (shifted left by 16),
+	 * leftover_count is set to 2, and NT_STATUS_SUCCESS is
+	 * returned. otherwise both leftover fields are cleared and
+	 * NT_STATUS_ILLEGAL_CHARACTER is returned.
+	**/
+
+	ptrdiff_t	remaining;
+	wchar16_t *	unit;
+
+	/* bytes between the failing unit and the end of the source */
+	remaining = (uintptr_t)params->src + (uintptr_t)params->src_size_in_bytes - (uintptr_t)params->addr_failed;
+	unit      = (wchar16_t *)params->addr_failed;
+
+	/* the unit is only examined when exactly one unit remains */
+	if ((remaining != 2) || (*unit < 0xD800) || (*unit >= 0xDC00)) {
+		params->leftover_count = 0;
+		params->leftover_bytes = 0;
+		return NT_STATUS_ILLEGAL_CHARACTER;
+	}
+
+	/* possibly the lead of a surrogate pair */
+	params->leftover_count   = 2;
+	params->leftover_bytes   = *unit;
+	params->leftover_bytes <<= 16;
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf16_to_utf8(
+	__in_out	nt_unicode_conversion_params_utf16_to_utf8 *	params)
+{
+	/**
+	 * converts the utf-16 stream described by params to utf-8
+	 * by running the utf-16 validator with one transcoding
+	 * handler per utf-8 sequence length (index 0 handles the
+	 * null terminator).
+	 *
+	 * in:  params->src, src_size_in_bytes, dst, dst_size_in_bytes,
+	 *      bytes_written (initial count).
+	 * out: params->bytes_written, code_points, addr_failed,
+	 *      leftover_count, leftover_bytes.
+	 *
+	 * when the validator reports an illegal character, the
+	 * failing unit is checked for being a lead surrogate cut
+	 * off by the end of the source, and the status is replaced
+	 * by the outcome of that check. any other failure status
+	 * is returned as is, with the leftover fields cleared,
+	 * since addr_failed then does not describe a split pair.
+	**/
+
+	int32_t				status;
+	nt_utf16_callback_args		args;
+	ntapi_uc_utf16_callback_fn *	callback_fn[5];
+
+	callback_fn[0] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_1byte_or_null_termination;
+	callback_fn[1] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_1byte_or_null_termination;
+	callback_fn[2] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_2bytes;
+	callback_fn[3] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_3bytes;
+	callback_fn[4] = (ntapi_uc_utf16_callback_fn *)__utf16_to_utf8_handler_4bytes;
+
+	args.src		= params->src;
+	args.dst		= params->dst;
+	args.dst_cap		= (void *)((uintptr_t)(params->dst) + (params->dst_size_in_bytes));
+	args.bytes_written	= params->bytes_written;
+
+	status = __ntapi_uc_validate_unicode_stream_utf16(
+			params->src,
+			params->src_size_in_bytes,
+			&params->code_points,
+			&params->addr_failed,
+			callback_fn,
+			&args);
+
+	params->bytes_written = args.bytes_written;
+
+	if (status == NT_STATUS_ILLEGAL_CHARACTER) {
+		status = __update_stream_leftover_info_utf16(params);
+	} else if (status != NT_STATUS_SUCCESS) {
+		params->leftover_count = 0;
+		params->leftover_bytes = 0;
+	}
+
+	/* the following bit shift will be optimized out on 32-bit architectures */
+	params->leftover_bytes <<= (8 * (sizeof(uintptr_t) - sizeof(uint32_t)));
+
+	return status;
+}
+
+
+int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf16_to_utf32(
+	__in_out	nt_unicode_conversion_params_utf16_to_utf32 *	params)
+{
+	/* placeholder: no conversion takes place and params is not accessed */
+	(void)params;
+
+	return NT_STATUS_SUCCESS;
+}
diff --git a/src/unicode/ntapi_uc_unicode_conversion_from_utf8.c b/src/unicode/ntapi_uc_unicode_conversion_from_utf8.c
new file mode 100644
index 0000000..02976ea
--- /dev/null
+++ b/src/unicode/ntapi_uc_unicode_conversion_from_utf8.c
@@ -0,0 +1,288 @@
+/********************************************************/
+/* ntapi: Native API core library */
+/* Copyright (C) 2013,2014,2015 Z. Gilboa */
+/* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */
+/********************************************************/
+
+#include <psxtypes/psxtypes.h>
+#include <ntapi/nt_status.h>
+#include <ntapi/nt_unicode.h>
+#include "ntapi_impl.h"
+
+
+typedef struct ___two_bytes { /* overlay for two consecutive bytes of a utf-8 stream */
+ unsigned char low; /* first byte in memory */
+ unsigned char high; /* second byte in memory */
+} __two_bytes;
+
+
+typedef struct ___three_bytes { /* overlay for three consecutive bytes of a utf-8 stream */
+ unsigned char low; /* first byte in memory */
+ unsigned char middle; /* second byte in memory */
+ unsigned char high; /* third byte in memory */
+} __three_bytes;
+
+
+static int32_t __fastcall __utf8_to_utf16_handler_1byte_or_null_termination(nt_utf8_callback_args * args)
+{
+	/***************************/
+	/* from: 0xxxxxxx          */
+	/* to:   00000000 0xxxxxxx */
+	/***************************/
+
+	/**
+	 * widens the current utf-8 byte (ascii or the null
+	 * terminator) to one utf-16 unit, then advances the
+	 * source by one byte and the destination by one unit.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless a whole utf-16 unit fits below
+	 * dst_cap; a single spare byte is not enough.
+	**/
+
+	wchar16_t *	dst;
+
+	if ((uintptr_t)(args->dst) + sizeof(wchar16_t) > (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	dst  = (wchar16_t *)args->dst;
+	*dst = *(args->src);
+
+	/* advance source and destination buffer */
+	args->src++;
+	args->dst = (void *)((uintptr_t)(args->dst) + sizeof(wchar16_t));
+
+	/* bytes_written */
+	args->bytes_written += sizeof(wchar16_t);
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __utf8_to_utf16_handler_2bytes(nt_utf8_callback_args * args)
+{
+	/***************************/
+	/* from: 110yyyyy 10xxxxxx */
+	/* to:   00000yyy yyxxxxxx */
+	/***************************/
+
+	/**
+	 * combines a two-byte utf-8 sequence into one utf-16
+	 * unit, then advances the source by two bytes and the
+	 * destination by one unit.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless a whole utf-16 unit fits below
+	 * dst_cap; a single spare byte is not enough.
+	**/
+
+	const unsigned char *	src;
+	wchar16_t *		dst;
+
+	if ((uintptr_t)(args->dst) + sizeof(wchar16_t) > (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	src = args->src;
+	dst = (wchar16_t *)args->dst;
+
+	/* yyyyy: payload of the lead byte */
+	*dst   = (wchar16_t)(src[0] & 0x1F);
+	*dst <<= 6;
+
+	/* xxxxxx: payload of the continuation byte */
+	*dst  |= (wchar16_t)(src[1] & 0x3F);
+
+	/* advance source and destination buffer */
+	args->src += 2;
+	args->dst  = (void *)((uintptr_t)(args->dst) + sizeof(wchar16_t));
+
+	/* bytes_written */
+	args->bytes_written += sizeof(wchar16_t);
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __utf8_to_utf16_handler_3bytes(nt_utf8_callback_args * args)
+{
+	/************************************/
+	/* from: 1110zzzz 10yyyyyy 10xxxxxx */
+	/* to:   zzzzyyyy yyxxxxxx          */
+	/************************************/
+
+	/**
+	 * combines a three-byte utf-8 sequence into one utf-16
+	 * unit, then advances the source by three bytes and the
+	 * destination by one unit.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless a whole utf-16 unit fits below
+	 * dst_cap; a single spare byte is not enough.
+	**/
+
+	const unsigned char *	src;
+	wchar16_t *		dst;
+	wchar16_t		unit;
+
+	if ((uintptr_t)(args->dst) + sizeof(wchar16_t) > (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	src = args->src;
+	dst = (wchar16_t *)args->dst;
+
+	/* zzzz */
+	unit   = (wchar16_t)(src[0] & 0x0F);
+	unit <<= 6;
+
+	/* yyyyyy */
+	unit  |= (wchar16_t)(src[1] & 0x3F);
+	unit <<= 6;
+
+	/* xxxxxx */
+	unit  |= (wchar16_t)(src[2] & 0x3F);
+
+	*dst = unit;
+
+	/* advance source and destination buffer */
+	args->src += 3;
+	args->dst  = (void *)((uintptr_t)(args->dst) + sizeof(wchar16_t));
+
+	/* bytes_written */
+	args->bytes_written += sizeof(wchar16_t);
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __utf8_to_utf16_handler_4bytes(nt_utf8_callback_args * args)
+{
+	/*************************************************/
+	/* from: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx     */
+	/* to:   110110ww wwzzzzyy 110111yy yyxxxxxx     */
+	/*       where wwww = uuuuu - 1                  */
+	/*************************************************/
+
+	/**
+	 * converts a four-byte utf-8 sequence into a utf-16
+	 * surrogate pair (lead unit first), then advances the
+	 * source by four bytes and the destination by two units.
+	 *
+	 * the scalar value is rebuilt first and then split into
+	 * the two ten-bit surrogate payloads; this keeps the uu
+	 * bits of the second byte out of the wwww field of the
+	 * lead surrogate.
+	 *
+	 * returns NT_STATUS_BUFFER_TOO_SMALL, with args left
+	 * untouched, unless both utf-16 units fit entirely
+	 * below dst_cap.
+	**/
+
+	const unsigned char *	src;
+	wchar16_t *		dst;
+	uint32_t		code_point;
+
+	if ((uintptr_t)(args->dst) + (2 * sizeof(wchar16_t)) > (uintptr_t)(args->dst_cap))
+		return NT_STATUS_BUFFER_TOO_SMALL;
+
+	src = args->src;
+	dst = (wchar16_t *)args->dst;
+
+	/* uuu, uuzzzz, yyyyyy, xxxxxx */
+	code_point   = (uint32_t)(src[0] & 0x07);
+	code_point <<= 6;
+	code_point  |= (uint32_t)(src[1] & 0x3F);
+	code_point <<= 6;
+	code_point  |= (uint32_t)(src[2] & 0x3F);
+	code_point <<= 6;
+	code_point  |= (uint32_t)(src[3] & 0x3F);
+
+	/* subtracting 0x10000 is what turns uuuuu into wwww = uuuuu - 1 */
+	code_point -= 0x10000;
+
+	/* 110110ww wwzzzzyy */
+	dst[0] = (wchar16_t)(0xD800 | (code_point >> 10));
+
+	/* 110111yy yyxxxxxx */
+	dst[1] = (wchar16_t)(0xDC00 | (code_point & 0x3FF));
+
+	/* advance source and destination buffer */
+	args->src += 4;
+	args->dst  = (void *)((uintptr_t)(args->dst) + (2 * sizeof(wchar16_t)));
+
+	/* bytes_written */
+	args->bytes_written += 2 * sizeof(wchar16_t);
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __update_stream_leftover_info_utf8(
+	__in_out	nt_unicode_conversion_params_utf8_to_utf16 *	params)
+{
+	/**
+	 * decides whether the bytes from params->addr_failed to
+	 * the end of the source buffer are the beginning of a
+	 * well-formed utf-8 sequence that was cut off by the end
+	 * of the buffer.
+	 *
+	 * one byte:    any lead byte of a multi-byte sequence.
+	 * two bytes:   a valid lead and second byte of a three-byte
+	 *              or four-byte sequence.
+	 * three bytes: a valid lead and second byte of a four-byte
+	 *              sequence, followed by a continuation byte.
+	 *
+	 * on a match, leftover_count is set to the number of bytes
+	 * and leftover_bytes holds them in stream order, starting
+	 * at bit 31 and zero-padded below; NT_STATUS_SUCCESS is
+	 * returned. otherwise both leftover fields are cleared and
+	 * NT_STATUS_ILLEGAL_CHARACTER is returned.
+	**/
+
+	ptrdiff_t	offset;
+	unsigned char *	utf8;
+	unsigned char	count;
+	unsigned char	i;
+	int		three_byte_prefix;
+	int		four_byte_prefix;
+	int		valid;
+
+	offset = (uintptr_t)params->src + (uintptr_t)params->src_size_in_bytes - (uintptr_t)params->addr_failed;
+	utf8   = (unsigned char *)params->addr_failed;
+	valid  = 0;
+
+	if (offset == 1) {
+		valid = (utf8[0] >= 0xC2) && (utf8[0] <= 0xF4);
+
+	} else if ((offset == 2) || (offset == 3)) {
+		four_byte_prefix =
+			   ((utf8[0] == 0xF0) && (utf8[1] >= 0x90) && (utf8[1] <= 0xBF))
+			|| ((utf8[0] >= 0xF1) && (utf8[0] <= 0xF3) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF))
+			|| ((utf8[0] == 0xF4) && (utf8[1] >= 0x80) && (utf8[1] <= 0x8F));
+
+		if (offset == 2) {
+			three_byte_prefix =
+				   ((utf8[0] == 0xE0) && (utf8[1] >= 0xA0) && (utf8[1] <= 0xBF))
+				|| ((utf8[0] >= 0xE1) && (utf8[0] <= 0xEC) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF))
+				|| ((utf8[0] == 0xED) && (utf8[1] >= 0x80) && (utf8[1] <= 0x9F))
+				|| ((utf8[0] >= 0xEE) && (utf8[0] <= 0xEF) && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF));
+
+			valid = three_byte_prefix || four_byte_prefix;
+		} else {
+			/* the third byte must be a continuation byte as well */
+			valid = four_byte_prefix
+				&& (utf8[2] >= 0x80) && (utf8[2] <= 0xBF);
+		}
+	}
+
+	if (!valid) {
+		params->leftover_count = 0;
+		params->leftover_bytes = 0;
+		return NT_STATUS_ILLEGAL_CHARACTER;
+	}
+
+	/* offset is 1, 2, or 3 at this point */
+	count = (unsigned char)offset;
+
+	params->leftover_count = count;
+	params->leftover_bytes = 0;
+
+	/* pack the bytes in stream order */
+	for (i = 0; i < count; i++) {
+		params->leftover_bytes <<= 8;
+		params->leftover_bytes  += utf8[i];
+	}
+
+	/* left-align within 32 bits */
+	params->leftover_bytes <<= 8 * (4 - count);
+
+	return NT_STATUS_SUCCESS;
+}
+
+int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf8_to_utf16(
+	__in_out	nt_unicode_conversion_params_utf8_to_utf16 *	params)
+{
+	/**
+	 * converts the utf-8 stream described by params to utf-16
+	 * by running the utf-8 validator with one transcoding
+	 * handler per utf-8 sequence length (index 0 handles the
+	 * null terminator).
+	 *
+	 * in:  params->src, src_size_in_bytes, dst, dst_size_in_bytes,
+	 *      bytes_written (initial count).
+	 * out: params->bytes_written, code_points, addr_failed,
+	 *      leftover_count, leftover_bytes.
+	 *
+	 * when the validator reports an illegal character, the
+	 * failing bytes are checked for being a sequence cut off
+	 * by the end of the source, and the status is replaced by
+	 * the outcome of that check. any other failure status is
+	 * returned as is, with the leftover fields cleared, since
+	 * addr_failed then does not describe a split sequence.
+	**/
+
+	int32_t				status;
+	nt_utf8_callback_args		args;
+	ntapi_uc_utf8_callback_fn *	callback_fn[5];
+
+	callback_fn[0] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_1byte_or_null_termination;
+	callback_fn[1] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_1byte_or_null_termination;
+	callback_fn[2] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_2bytes;
+	callback_fn[3] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_3bytes;
+	callback_fn[4] = (ntapi_uc_utf8_callback_fn *)__utf8_to_utf16_handler_4bytes;
+
+	args.src		= params->src;
+	args.dst		= params->dst;
+	args.dst_cap		= (void *)((uintptr_t)(params->dst) + (params->dst_size_in_bytes));
+	args.bytes_written	= params->bytes_written;
+
+	status = __ntapi_uc_validate_unicode_stream_utf8(
+			params->src,
+			params->src_size_in_bytes,
+			&params->code_points,
+			&params->addr_failed,
+			callback_fn,
+			&args);
+
+	params->bytes_written = args.bytes_written;
+
+	if (status == NT_STATUS_ILLEGAL_CHARACTER) {
+		status = __update_stream_leftover_info_utf8(params);
+	} else if (status != NT_STATUS_SUCCESS) {
+		params->leftover_count = 0;
+		params->leftover_bytes = 0;
+	}
+
+	/* (optimized out on 32-bit architectures) */
+	params->leftover_bytes <<= (8 * (sizeof(uintptr_t) - sizeof(uint32_t)));
+
+	return status;
+}
+
+
+int32_t __stdcall __ntapi_uc_convert_unicode_stream_utf8_to_utf32(
+	__in_out	nt_unicode_conversion_params_utf8_to_utf32 *	params)
+{
+	/* placeholder: no conversion takes place and params is not accessed */
+	(void)params;
+
+	return NT_STATUS_SUCCESS;
+}
diff --git a/src/unicode/ntapi_uc_unicode_validation.c b/src/unicode/ntapi_uc_unicode_validation.c
new file mode 100644
index 0000000..4c6fcac
--- /dev/null
+++ b/src/unicode/ntapi_uc_unicode_validation.c
@@ -0,0 +1,329 @@
+/********************************************************/
+/* ntapi: Native API core library */
+/* Copyright (C) 2013,2014,2015 Z. Gilboa */
+/* Released under GPLv2 and GPLv3; see COPYING.NTAPI. */
+/********************************************************/
+
+#include <psxtypes/psxtypes.h>
+#include <ntapi/nt_status.h>
+#include <ntapi/nt_unicode.h>
+
+/**
+ * unofficial bit distribution table for comprehension purposes only
+ *
+ * scalar nickname utf-16 utf-8[0] utf-8[1] utf-8[2] utf-8[3]
+ * ------ -------- -------- -------- -------- -------- --------
+ * 00000000 7x 00000000 0xxxxxxx
+ * 0xxxxxxx 0xxxxxxx
+ *
+ * 00000yyy 5y6x 00000yyy 110yyyyy 10xxxxxx
+ * yyxxxxxx yyxxxxxx
+ *
+ * zzzzyyyy 4z6y6x zzzzyyyy 1110zzzz 10yyyyyy 10xxxxxx
+ * yyxxxxxx yyxxxxxx
+ *
+ * 000uuuuu 5u4z6y6x 110110ww 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
+ * zzzzyyyy wwzzzzyy
+ * yyxxxxxx 110111yy
+ * yyxxxxxx (where wwww = uuuuu - 1)
+ *
+ *
+ * validation of utf-8
+ *
+ * from to utf-8[0] utf-8[1] utf-8[2] utf-8[3]
+ * ------ ------ -------- -------- -------- --------
+ * 0x0000 0x007F 00..7F
+ * 0x0080 0x07FF C2..DF 80..BF
+ * 0x0800 0x0FFF E0 A0..BF 80..BF
+ * 0x1000 0xCFFF E1..EC 80..BF 80..BF
+ * 0xD000 0xD7FF ED 80..9F 80..BF
+ * 0xE000 0xFFFF EE..EF 80..BF 80..BF
+ * 0x10000 0x3FFFF F0 90..BF 80..BF 80..BF
+ * 0x40000 0xFFFFF F1..F3 80..BF 80..BF 80..BF
+ * 0x100000 0x10FFFF F4 80..8F 80..BF 80..BF
+ *
+**/
+
+
+#define __AVAILABLE_CODE_POINTS 0x110000
+
+int __stdcall __ntapi_uc_get_code_point_byte_count_utf8(uint32_t code_point)
+{
+	/**
+	 * returns the number of bytes (1..4) that the utf-8 form
+	 * of code_point occupies, or 0 when code_point is not
+	 * below __AVAILABLE_CODE_POINTS.
+	 *
+	 * only the magnitude is examined; values in the surrogate
+	 * range are reported as three bytes.
+	**/
+
+	/* __AVAILABLE_CODE_POINTS exceeded */
+	if (code_point >= __AVAILABLE_CODE_POINTS)
+		return 0;
+
+	/* fits in 7x bits */
+	if ((code_point >> 7) == 0)
+		return 1;
+
+	/* fits in 5y + 6x bits */
+	if ((code_point >> 11) == 0)
+		return 2;
+
+	/* fits in 4z + 6y + 6x bits */
+	if ((code_point >> 16) == 0)
+		return 3;
+
+	/* needs 5u + 4z + 6y + 6x bits */
+	return 4;
+}
+
+
+int __stdcall __ntapi_uc_get_code_point_byte_count_utf16(uint32_t code_point)
+{
+	/**
+	 * returns the number of bytes (2 or 4) that the utf-16
+	 * form of code_point occupies, or 0 when code_point is
+	 * not below __AVAILABLE_CODE_POINTS.
+	 *
+	 * only the magnitude is examined; values in the surrogate
+	 * range are reported as two bytes.
+	**/
+
+	/* __AVAILABLE_CODE_POINTS exceeded */
+	if (code_point >= __AVAILABLE_CODE_POINTS)
+		return 0;
+
+	/* fits in 4z + 6y + 6x bits: a single unit */
+	if ((code_point >> 16) == 0)
+		return 2;
+
+	/* needs 5u + 4z + 6y + 6x bits: a surrogate pair */
+	return 4;
+}
+
+
+/**
+ * following is a straight-forward implementation
+ * of unicode conversion and validation (see also:
+ * Table 3-7 of the Unicode Standard, version 6.2).
+ *
+ * the use of callbacks allows the validation
+ * functions to be the basis of our utf-8 conversion
+ * functions on the one hand, and the posix path arg
+ * normalization routine on the other.
+**/
+
+static int32_t __fastcall __default_callback_fn_utf8(nt_utf8_callback_args * args)
+{
+	/* validation only: step over the sequence without transcoding it */
+	args->src = args->src + args->byte_count;
+
+	return NT_STATUS_SUCCESS;
+}
+
+/**
+ * returns the length (1..4) of the well-formed utf-8 sequence
+ * that starts at utf8[0], or 0 if there is none within the
+ * next avail bytes (avail must be at least 1).
+ *
+ * bytes are examined strictly in order and no byte at or
+ * beyond utf8[avail] is read; examination also stops at the
+ * first byte that does not belong to the sequence.
+**/
+static unsigned char __utf8_sequence_byte_count(
+	const unsigned char *	utf8,
+	size_t			avail)
+{
+	unsigned char	lead;
+	unsigned char	length;
+	unsigned char	second_min;
+	unsigned char	second_max;
+	unsigned char	i;
+
+	lead = utf8[0];
+
+	/* one byte */
+	if (lead <= 0x7F)
+		return 1;
+
+	/* stray continuation byte, or overlong lead (C0, C1) */
+	if (lead < 0xC2)
+		return 0;
+
+	/* the lead byte determines the length and the range of the second byte */
+	second_min = 0x80;
+	second_max = 0xBF;
+
+	if (lead <= 0xDF) {
+		length = 2;
+	} else if (lead == 0xE0) {
+		length = 3;
+		second_min = 0xA0;
+	} else if (lead <= 0xEC) {
+		length = 3;
+	} else if (lead == 0xED) {
+		length = 3;
+		second_max = 0x9F;
+	} else if (lead <= 0xEF) {
+		length = 3;
+	} else if (lead == 0xF0) {
+		length = 4;
+		second_min = 0x90;
+	} else if (lead <= 0xF3) {
+		length = 4;
+	} else if (lead == 0xF4) {
+		length = 4;
+		second_max = 0x8F;
+	} else {
+		return 0;
+	}
+
+	/* sequence cut off by the stream boundary */
+	if (avail < length)
+		return 0;
+
+	if ((utf8[1] < second_min) || (utf8[1] > second_max))
+		return 0;
+
+	for (i = 2; i < length; i++)
+		if ((utf8[i] < 0x80) || (utf8[i] > 0xBF))
+			return 0;
+
+	return length;
+}
+
+
+/**
+ * validates a utf-8 stream and invokes a callback for every
+ * well-formed sequence.
+ *
+ * ch:            beginning of the stream; ignored when
+ *                callback_args->src is already set.
+ * size_in_bytes: size of the stream, or 0 for a stream that
+ *                is bounded by its null terminator only.
+ * code_points:   incremented once per sequence that has been
+ *                handed to a callback successfully; not reset
+ *                here.
+ * addr_failed:   receives the address of the offending
+ *                sequence when a failure status is returned.
+ * callback_fn:   five handlers; index 1..4 is selected by the
+ *                sequence length, index 0 is invoked for the
+ *                null terminator. defaults to handlers that
+ *                merely advance the source.
+ * callback_args: handler arguments; src is the stream cursor,
+ *                and byte_count is set before each invocation.
+ *
+ * returns NT_STATUS_SUCCESS when the end of the stream or a
+ * null terminator has been reached, NT_STATUS_ILLEGAL_CHARACTER
+ * at the first ill-formed or truncated sequence, or the failure
+ * status of a callback; a failing callback ends the scan, since
+ * it is not expected to have advanced the cursor.
+**/
+int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf8(
+	__in	const unsigned char *		ch,
+	__in	size_t				size_in_bytes	__optional,
+	__out	size_t *			code_points	__optional,
+	__out	void **				addr_failed	__optional,
+	__in	ntapi_uc_utf8_callback_fn **	callback_fn	__optional,
+	__in	nt_utf8_callback_args *		callback_args	__optional)
+{
+	const unsigned char *	utf8;
+	unsigned char *		ch_boundary;
+	unsigned char		byte_count;
+	size_t			avail;
+	size_t			_code_points;
+	int32_t			status;
+
+	ntapi_uc_utf8_callback_fn *	_callback_fn[5];
+	nt_utf8_callback_args		_callback_args;
+
+	if (!callback_fn) {
+		_callback_fn[0] = __default_callback_fn_utf8;
+		_callback_fn[1] = __default_callback_fn_utf8;
+		_callback_fn[2] = __default_callback_fn_utf8;
+		_callback_fn[3] = __default_callback_fn_utf8;
+		_callback_fn[4] = __default_callback_fn_utf8;
+		callback_fn = (ntapi_uc_utf8_callback_fn **)&_callback_fn;
+	}
+
+	if (!callback_args) {
+		callback_args = &_callback_args;
+		callback_args->src = (unsigned char *)0;
+	}
+
+	/* a cursor provided by the caller takes precedence over ch */
+	if (callback_args->src)
+		ch = callback_args->src;
+	else
+		callback_args->src = ch;
+
+	if (size_in_bytes)
+		ch_boundary = (unsigned char *)((uintptr_t)ch + size_in_bytes);
+	else
+		ch_boundary = (unsigned char *)(~0);
+
+	if (!code_points) {
+		_code_points = 0;
+		code_points  = &_code_points;
+	}
+
+	while ((ch < ch_boundary) && (*ch)) {
+		utf8  = ch;
+		avail = (size_t)((uintptr_t)ch_boundary - (uintptr_t)ch);
+
+		byte_count = __utf8_sequence_byte_count(utf8,avail);
+
+		if (!byte_count) {
+			if (addr_failed)
+				*addr_failed = (void *)utf8;
+			return NT_STATUS_ILLEGAL_CHARACTER;
+		}
+
+		callback_args->byte_count = byte_count;
+		status = callback_fn[byte_count](callback_args);
+
+		if (status != NT_STATUS_SUCCESS) {
+			if (addr_failed)
+				*addr_failed = (void *)utf8;
+			return status;
+		}
+
+		(*code_points)++;
+
+		/* advance, transcode if needed */
+		ch = callback_args->src;
+	}
+
+	/* null terminator: the status of its handler is not propagated */
+	if ((ch < ch_boundary) && (*ch == 0))
+		callback_fn[0](callback_args);
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+static int32_t __fastcall __default_callback_fn_utf16(nt_utf16_callback_args * args)
+{
+	/* validation only: a byte_count of 4 denotes a surrogate pair (two units) */
+	args->src += (args->byte_count == 4) ? 2 : 1;
+
+	return NT_STATUS_SUCCESS;
+}
+
+
+/**
+ * validates a utf-16 stream and invokes a callback for every
+ * well-formed code point.
+ *
+ * wch:           beginning of the stream; ignored when
+ *                callback_args->src is already set.
+ * size_in_bytes: size of the stream, or 0 for a stream that
+ *                is bounded by its null terminator only.
+ * code_points:   incremented once per code point that has been
+ *                handed to a callback successfully; not reset
+ *                here.
+ * addr_failed:   receives the address of the offending unit
+ *                when a failure status is returned.
+ * callback_fn:   five handlers; index 1..4 is the length of
+ *                the utf-8 form of the code point (4 denotes
+ *                a surrogate pair), index 0 is invoked for the
+ *                null terminator. defaults to handlers that
+ *                merely advance the source.
+ * callback_args: handler arguments; src is the stream cursor,
+ *                and byte_count is set before each invocation.
+ *
+ * returns NT_STATUS_SUCCESS when the end of the stream or a
+ * null terminator has been reached, NT_STATUS_ILLEGAL_CHARACTER
+ * at the first unpaired surrogate, or the failure status of a
+ * callback; a failing callback ends the scan, since it is not
+ * expected to have advanced the cursor.
+**/
+int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf16(
+	__in	const wchar16_t *		wch,
+	__in	size_t				size_in_bytes	__optional,
+	__out	size_t *			code_points	__optional,
+	__out	void **				addr_failed	__optional,
+	__in	ntapi_uc_utf16_callback_fn **	callback_fn	__optional,
+	__in	nt_utf16_callback_args *	callback_args	__optional)
+{
+	const wchar16_t *	wch_trail;
+	wchar16_t *		wch_boundary;
+	unsigned char		byte_count;
+	size_t			_code_points;
+	int32_t			status;
+
+	ntapi_uc_utf16_callback_fn *	_callback_fn[5];
+	nt_utf16_callback_args		_callback_args;
+
+	if (!callback_fn) {
+		_callback_fn[0] = __default_callback_fn_utf16;
+		_callback_fn[1] = __default_callback_fn_utf16;
+		_callback_fn[2] = __default_callback_fn_utf16;
+		_callback_fn[3] = __default_callback_fn_utf16;
+		_callback_fn[4] = __default_callback_fn_utf16;
+		callback_fn = (ntapi_uc_utf16_callback_fn **)&_callback_fn;
+	}
+
+	if (!callback_args) {
+		callback_args = &_callback_args;
+		callback_args->src = (wchar16_t *)0;
+	}
+
+	/* a cursor provided by the caller takes precedence over wch */
+	if (callback_args->src)
+		wch = callback_args->src;
+	else
+		callback_args->src = wch;
+
+	if (size_in_bytes)
+		wch_boundary = (wchar16_t *)((uintptr_t)wch + size_in_bytes);
+	else
+		wch_boundary = (wchar16_t *)(~0);
+
+	if (!code_points) {
+		_code_points = 0;
+		code_points  = &_code_points;
+	}
+
+	while ((wch < wch_boundary) && (*wch)) {
+		byte_count = 0;
+
+		/* try one byte */
+		if (*wch <= 0x7F)
+			byte_count = 1;
+
+		/* try two bytes */
+		else if (*wch <= 0x7FF)
+			byte_count = 2;
+
+		/* try three bytes */
+		else if ((*wch < 0xD800) || (*wch >= 0xE000))
+			byte_count = 3;
+
+		/* try four bytes: a lead surrogate must be followed by a trail surrogate */
+		else if ((*wch >= 0xD800) && (*wch < 0xDC00)) {
+			wch_trail = wch + 1;
+
+			if ((wch_trail < wch_boundary)
+					&& (*wch_trail >= 0xDC00)
+					&& (*wch_trail <  0xE000))
+				byte_count = 4;
+		}
+
+		if (!byte_count) {
+			if (addr_failed)
+				*addr_failed = (void *)wch;
+			return NT_STATUS_ILLEGAL_CHARACTER;
+		}
+
+		callback_args->byte_count = byte_count;
+		status = callback_fn[byte_count](callback_args);
+
+		if (status != NT_STATUS_SUCCESS) {
+			if (addr_failed)
+				*addr_failed = (void *)wch;
+			return status;
+		}
+
+		(*code_points)++;
+
+		/* advance, transcode as needed */
+		wch = callback_args->src;
+	}
+
+	/* null terminator: the status of its handler is not propagated */
+	if ((wch < wch_boundary) && (*wch == 0))
+		callback_fn[0](callback_args);
+
+	return NT_STATUS_SUCCESS;
+}