summaryrefslogtreecommitdiffhomepage
path: root/src/argv
diff options
context:
space:
mode:
author: midipix <writeonce@midipix.org> 2023-01-11 19:12:40 +0000
committer: midipix <writeonce@midipix.org> 2023-01-13 08:25:24 +0000
commit: 4cd40123d8217d20824a4323aab56eaae27a373b (patch)
tree: 0c8a91eff59204cbfdf5a2188e2aa532fcdfe5da /src/argv
parent: 345a52dd5b7a6a8a6f170bd6352b55a77e68c39d (diff)
download: ntapi-4cd40123d8217d20824a4323aab56eaae27a373b.tar.bz2
ntapi-4cd40123d8217d20824a4323aab56eaae27a373b.tar.xz
__ntapi_tt_array_convert_utf8_to_utf16(): initial implementation.
Diffstat (limited to 'src/argv')
-rw-r--r-- src/argv/ntapi_tt_array_utf8.c 250
1 files changed, 241 insertions, 9 deletions
diff --git a/src/argv/ntapi_tt_array_utf8.c b/src/argv/ntapi_tt_array_utf8.c
index b1c95b5..a2f6d0f 100644
--- a/src/argv/ntapi_tt_array_utf8.c
+++ b/src/argv/ntapi_tt_array_utf8.c
@@ -10,6 +10,137 @@
#include <ntapi/ntapi.h>
#include "ntapi_impl.h"
+/* byte-wise view of two consecutive bytes, in memory order */
+struct ___two_bytes {
+	unsigned char	low;	/* first byte  (offset 0) */
+	unsigned char	high;	/* second byte (offset 1) */
+};
+
+typedef struct ___two_bytes __two_bytes;
+
+
+/* byte-wise view of three consecutive bytes, in memory order */
+struct ___three_bytes {
+	unsigned char	low;	/* first byte  (offset 0) */
+	unsigned char	middle;	/* second byte (offset 1) */
+	unsigned char	high;	/* third byte  (offset 2) */
+};
+
+typedef struct ___three_bytes __three_bytes;
+
+/*********************************************************/
+/* single byte: copy the byte into one 16-bit code unit  */
+/*   from: 0xxxxxxx                                      */
+/*   to:   00000000 0xxxxxxx                             */
+/*********************************************************/
+static void __utf8_to_utf16_handler_1byte_or_null_termination(wchar16_t * dst, const unsigned char * ch)
+{
+	dst[0] = ch[0];
+}
+
+
+/*********************************************************/
+/* two bytes: combine both payloads into one code unit   */
+/*   from: 110yyyyy 10xxxxxx                             */
+/*   to:   00000yyy yyxxxxxx                             */
+/*********************************************************/
+static void __utf8_to_utf16_handler_2bytes(wchar16_t * dst, const unsigned char * ch)
+{
+	wchar16_t	unit;
+
+	/* yyyyy: strip the 110 marker from the first byte, move above xxxxxx */
+	unit = (wchar16_t)(ch[0] ^ 0xC0);
+	unit = (wchar16_t)(unit << 6);
+
+	/* xxxxxx: strip the 10 marker from the second byte */
+	unit |= (wchar16_t)(ch[1] ^ 0x80);
+
+	*dst = unit;
+}
+
+
+/*********************************************************/
+/* three bytes: combine all payloads into one code unit  */
+/*   from: 1110zzzz 10yyyyyy 10xxxxxx                    */
+/*   to:   zzzzyyyy yyxxxxxx                             */
+/*********************************************************/
+static void __utf8_to_utf16_handler_3bytes(wchar16_t * dst, const unsigned char * ch)
+{
+	wchar16_t	unit;
+	wchar16_t	middle;
+
+	/* zzzz: strip the 1110 marker, move to the top four bits */
+	unit = (wchar16_t)(ch[0] ^ 0xE0);
+	unit = (wchar16_t)(unit << 12);
+
+	/* yyyyyy: strip the 10 marker, move above xxxxxx */
+	middle = (wchar16_t)(ch[1] ^ 0x80);
+	middle = (wchar16_t)(middle << 6);
+	unit  |= middle;
+
+	/* xxxxxx: strip the 10 marker */
+	unit |= (wchar16_t)(ch[2] ^ 0x80);
+
+	*dst = unit;
+}
+
+
+/*********************************************************/
+/* four bytes: produce two code units (lead, then trail) */
+/*   from: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx           */
+/*   to:   110110ww wwzzzzyy 110111yy yyxxxxxx           */
+/*   where wwww = uuuuu - 1                              */
+/*********************************************************/
+static void __utf8_to_utf16_handler_4bytes(wchar16_t * dst, const unsigned char * ch)
+{
+	unsigned char	u_top;		/* 000uuu00 */
+	unsigned char	u_bottom;	/* 000000uu */
+	unsigned char	z_bits;		/* 00zzzz00 */
+	unsigned char	y_top;		/* 000000yy */
+	unsigned char	y_bottom;	/* yyyy0000 */
+	wchar16_t	w_bits;
+	wchar16_t	lead_unit;
+	wchar16_t	trail_unit;
+
+	/* uuuuu, collected from the first and second bytes */
+	u_top    = (unsigned char)(ch[0] ^ 0xF0);
+	u_top    = (unsigned char)(u_top << 2);
+	u_bottom = (unsigned char)(ch[1] ^ 0x80);
+	u_bottom = (unsigned char)(u_bottom >> 4);
+
+	/* wwww = uuuuu - 1, placed above zzzzyy */
+	w_bits = (wchar16_t)((u_top | u_bottom) - 1);
+	w_bits = (wchar16_t)(w_bits << 6);
+
+	/* zzzz: the narrowing to unsigned char drops the upper nibble */
+	z_bits = (unsigned char)(ch[1] << 4);
+	z_bits = (unsigned char)(z_bits >> 2);
+
+	/* upper two bits of yyyyyy */
+	y_top = (unsigned char)(ch[2] ^ 0x80);
+	y_top = (unsigned char)(y_top >> 4);
+
+	/* lower four bits of yyyyyy: the narrowing drops everything above */
+	y_bottom = (unsigned char)(ch[2] << 4);
+
+	/* 110110ww wwzzzzyy */
+	lead_unit  = 0xD800;
+	lead_unit |= w_bits;
+	lead_unit |= z_bits;
+	lead_unit |= y_top;
+
+	/* 110111yy yyxxxxxx */
+	trail_unit  = (wchar16_t)(y_bottom << 2);
+	trail_unit |= (wchar16_t)(ch[3] ^ 0x80);
+	trail_unit |= 0xDC00;
+
+	/* write */
+	dst[0] = lead_unit;
+	dst[1] = trail_unit;
+}
+
int32_t __stdcall __ntapi_tt_array_copy_utf8(
__out int * argc,
__in const char ** argv,
@@ -140,21 +271,122 @@ int32_t __stdcall __ntapi_tt_array_copy_utf8(
return NT_STATUS_SUCCESS;
}
+/* signature shared by all utf-8 --> utf-16 sequence handlers */
+typedef void __utf8_to_utf16_handler_fn(wchar16_t *, const unsigned char *);
+
+/* handler table, indexed by the byte count of the utf-8 sequence */
+static __utf8_to_utf16_handler_fn * __utf8_to_utf16_handlers[5] = {
+	0,							/* [0]: no handler  */
+	__utf8_to_utf16_handler_1byte_or_null_termination,	/* [1]: one byte    */
+	__utf8_to_utf16_handler_2bytes,				/* [2]: two bytes   */
+	__utf8_to_utf16_handler_3bytes,				/* [3]: three bytes */
+	__utf8_to_utf16_handler_4bytes				/* [4]: four bytes  */
+};
+
+/*********************************************************************/
+/* __utf8_sequence_length():                                         */
+/* returns the length in bytes (1..4) of the utf-8 sequence that     */
+/* begins at utf8[0], or 0 if the bytes are not an accepted sequence.*/
+/*                                                                   */
+/* accepted lead bytes and second-byte ranges:                       */
+/*   00..7F                 (one byte)                               */
+/*   C2..DF  80..BF         (two bytes)                              */
+/*   E0      A0..BF         (three bytes)                            */
+/*   E1..EC  80..BF                                                  */
+/*   ED      80..9F                                                  */
+/*   EE..EF  80..BF                                                  */
+/*   F0      90..BF         (four bytes)                             */
+/*   F1..F3  80..BF                                                  */
+/*   F4      80..8F                                                  */
+/* all remaining continuation bytes must be in 80..BF.               */
+/*                                                                   */
+/* bytes are examined strictly in order and the scan stops at the    */
+/* first mismatch; a null byte never matches a continuation range,   */
+/* so nothing past a string's null terminator is read.               */
+/*********************************************************************/
+static uint8_t __utf8_sequence_length(const uint8_t * utf8)
+{
+	uint8_t	lead;
+	uint8_t	len;
+	uint8_t	idx;
+	uint8_t	second_min;
+	uint8_t	second_max;
+
+	lead       = utf8[0];
+	second_min = 0x80;
+	second_max = 0xBF;
+
+	if (lead <= 0x7F) {
+		return 1;
+
+	} else if ((lead >= 0xC2) && (lead <= 0xDF)) {
+		len = 2;
+
+	} else if ((lead >= 0xE0) && (lead <= 0xEF)) {
+		len = 3;
+
+		if (lead == 0xE0)
+			second_min = 0xA0;
+		else if (lead == 0xED)
+			second_max = 0x9F;
+
+	} else if ((lead >= 0xF0) && (lead <= 0xF4)) {
+		len = 4;
+
+		if (lead == 0xF0)
+			second_min = 0x90;
+		else if (lead == 0xF4)
+			second_max = 0x8F;
+
+	} else {
+		return 0;
+	}
+
+	if ((utf8[1] < second_min) || (utf8[1] > second_max))
+		return 0;
+
+	for (idx=2; idx<len; idx++)
+		if ((utf8[idx] < 0x80) || (utf8[idx] > 0xBF))
+			return 0;
+
+	return len;
+}
+
+/*********************************************************************/
+/* __ntapi_tt_array_convert_utf8_to_utf16():                         */
+/* converts each utf-8 string of the null-terminated array arrv to   */
+/* utf-16, writing the null-terminated results back to back into     */
+/* buffer, and recording one entry per string, followed by a null    */
+/* entry, in warrv.                                                  */
+/*                                                                   */
+/* arrv:          null-terminated array of source entries; base is   */
+/*                added to each entry to obtain the address of the   */
+/*                string that is read. a null arrv is treated as an  */
+/*                empty array.                                       */
+/* warrv:         receives, for each string, the address of its      */
+/*                converted copy with base subtracted; the caller    */
+/*                must provide room for one entry per source string  */
+/*                plus the terminating null entry.                   */
+/* base:          the offset described above; must be a multiple of  */
+/*                sizeof(wchar16_t).                                 */
+/* buffer:        receives the utf-16 code units.                    */
+/* buffer_len:    size of buffer, in bytes.                          */
+/* bytes_written: on success, receives the number of bytes written   */
+/*                to buffer, null terminators included; not modified */
+/*                on failure.                                        */
+/*                                                                   */
+/* returns:                                                          */
+/*   NT_STATUS_SUCCESS             all strings were converted.       */
+/*   NT_STATUS_INVALID_PARAMETER_3 base is not a multiple of         */
+/*                                 sizeof(wchar16_t).                */
+/*   NT_STATUS_ILLEGAL_CHARACTER   a source string contains a byte   */
+/*                                 sequence that is not accepted.    */
+/*   NT_STATUS_BUFFER_TOO_SMALL    buffer cannot hold the complete   */
+/*                                 output. NOTE(review): this status */
+/*                                 is assumed to be provided by the  */
+/*                                 ntapi status header -- confirm.   */
+/*********************************************************************/
int32_t __stdcall __ntapi_tt_array_convert_utf8_to_utf16(
__in char ** arrv,
- __in wchar16_t ** arra,
- __in void * base,
- __in wchar16_t * buffer,
+ __out wchar16_t ** warrv,
+ __out void * base,
+ __out wchar16_t * buffer,
__in size_t buffer_len,
__out size_t * bytes_written)
{
- (void)arrv;
- (void)arra;
- (void)base;
- (void)buffer;
- (void)buffer_len;
+	wchar16_t *	wch;
+	size_t		avail;
+	size_t		units;
+	ptrdiff_t	diff;
+	ptrdiff_t	wdiff;
+	const uint8_t *	utf8;
+	uint8_t		byte_count;
+
+	if ((uintptr_t)base % sizeof(wchar16_t))
+		return NT_STATUS_INVALID_PARAMETER_3;
+
+	wch   = buffer;
+	diff  = (ptrdiff_t)base;
+	wdiff = (ptrdiff_t)base / sizeof(wchar16_t);
+
+	/* code units still free in buffer; counting them (rather than */
+	/* deriving an upper-bound pointer) stays well-defined for any */
+	/* buffer_len, including one smaller than a few code units.    */
+	avail = buffer_len / sizeof(wchar16_t);
+
+	for (; arrv && *arrv; arrv++,warrv++) {
+		*warrv = wch - wdiff;
+		utf8   = (const uint8_t *)(*arrv + diff);
+
+		while (*utf8) {
+			if (!(byte_count = __utf8_sequence_length(utf8)))
+				return NT_STATUS_ILLEGAL_CHARACTER;
+
+			/* a four-byte sequence becomes a surrogate pair */
+			units = (byte_count == 4) ? 2 : 1;
+
+			/* always keep one unit for this string's null terminator */
+			if (avail <= units)
+				return NT_STATUS_BUFFER_TOO_SMALL;
+
+			__utf8_to_utf16_handlers[byte_count](wch,utf8);
+
+			wch   += units;
+			avail -= units;
+			utf8  += byte_count;
+		}
+
+		/* only reachable with avail == 0 for an empty source string */
+		if (!avail)
+			return NT_STATUS_BUFFER_TOO_SMALL;
+
+		*wch++ = 0;
+		avail--;
+	}
- *bytes_written = 0;
+	*warrv = 0;
+	*bytes_written = sizeof(wchar16_t) * (wch - buffer);
return NT_STATUS_SUCCESS;
}