mirror of
https://github.com/microsoft/terminal.git
synced 2025-12-19 18:11:39 -05:00
Remove unused Utf8ToWideCharParser (#16392)
I randomly came across this class, which I didn't even remember we had. We don't use this class at the moment and won't need it any time soon. Its current implementation is also fairly questionable. While `til::u16state` isn't "perfect", it's vastly better than this.
This commit is contained in:
@@ -125,8 +125,6 @@
|
|||||||
* Private calls into the Windows Window Manager to perform privileged actions related to the console process (working to eliminate) or for High DPI stuff (also working to eliminate)
|
* Private calls into the Windows Window Manager to perform privileged actions related to the console process (working to eliminate) or for High DPI stuff (also working to eliminate)
|
||||||
* `Userprivapi.cpp`
|
* `Userprivapi.cpp`
|
||||||
* `Windowdpiapi.cpp`
|
* `Windowdpiapi.cpp`
|
||||||
* New UTF8 state machine in progress to improve Bash (and other apps) support for UTF-8 in console
|
|
||||||
* `Utf8ToWideCharParser.cpp`
|
|
||||||
* Window resizing/layout/management/window messaging loops and all that other stuff that has us interact with Windows to create a visual display surface and control the user interaction entry point
|
* Window resizing/layout/management/window messaging loops and all that other stuff that has us interact with Windows to create a visual display surface and control the user interaction entry point
|
||||||
* `Window.cpp`
|
* `Window.cpp`
|
||||||
* `Windowproc.cpp`
|
* `Windowproc.cpp`
|
||||||
|
|||||||
@@ -46,7 +46,6 @@
|
|||||||
<ClCompile Include="..\telemetry.cpp" />
|
<ClCompile Include="..\telemetry.cpp" />
|
||||||
<ClCompile Include="..\tracing.cpp" />
|
<ClCompile Include="..\tracing.cpp" />
|
||||||
<ClCompile Include="..\utils.cpp" />
|
<ClCompile Include="..\utils.cpp" />
|
||||||
<ClCompile Include="..\utf8ToWideCharParser.cpp" />
|
|
||||||
<ClCompile Include="..\VtApiRoutines.cpp" />
|
<ClCompile Include="..\VtApiRoutines.cpp" />
|
||||||
<ClCompile Include="..\VtInputThread.cpp" />
|
<ClCompile Include="..\VtInputThread.cpp" />
|
||||||
<ClCompile Include="..\VtIo.cpp" />
|
<ClCompile Include="..\VtIo.cpp" />
|
||||||
@@ -100,7 +99,6 @@
|
|||||||
<ClInclude Include="..\telemetry.hpp" />
|
<ClInclude Include="..\telemetry.hpp" />
|
||||||
<ClInclude Include="..\tracing.hpp" />
|
<ClInclude Include="..\tracing.hpp" />
|
||||||
<ClInclude Include="..\utils.hpp" />
|
<ClInclude Include="..\utils.hpp" />
|
||||||
<ClInclude Include="..\utf8ToWideCharParser.hpp" />
|
|
||||||
<ClInclude Include="..\VtApiRoutines.h" />
|
<ClInclude Include="..\VtApiRoutines.h" />
|
||||||
<ClInclude Include="..\VtInputThread.hpp" />
|
<ClInclude Include="..\VtInputThread.hpp" />
|
||||||
<ClInclude Include="..\VtIo.hpp" />
|
<ClInclude Include="..\VtIo.hpp" />
|
||||||
|
|||||||
@@ -111,9 +111,6 @@
|
|||||||
<ClCompile Include="..\conimeinfo.cpp">
|
<ClCompile Include="..\conimeinfo.cpp">
|
||||||
<Filter>Source Files</Filter>
|
<Filter>Source Files</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<ClCompile Include="..\utf8ToWideCharParser.cpp">
|
|
||||||
<Filter>Source Files</Filter>
|
|
||||||
</ClCompile>
|
|
||||||
<ClCompile Include="..\ntprivapi.cpp">
|
<ClCompile Include="..\ntprivapi.cpp">
|
||||||
<Filter>Source Files</Filter>
|
<Filter>Source Files</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -266,9 +263,6 @@
|
|||||||
<ClInclude Include="..\outputStream.hpp">
|
<ClInclude Include="..\outputStream.hpp">
|
||||||
<Filter>Header Files</Filter>
|
<Filter>Header Files</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
<ClInclude Include="..\utf8ToWideCharParser.hpp">
|
|
||||||
<Filter>Header Files</Filter>
|
|
||||||
</ClInclude>
|
|
||||||
<ClInclude Include="..\ApiRoutines.h">
|
<ClInclude Include="..\ApiRoutines.h">
|
||||||
<Filter>Header Files</Filter>
|
<Filter>Header Files</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
|||||||
@@ -84,7 +84,6 @@ SOURCES = \
|
|||||||
..\writeData.cpp \
|
..\writeData.cpp \
|
||||||
..\renderData.cpp \
|
..\renderData.cpp \
|
||||||
..\renderFontDefaults.cpp \
|
..\renderFontDefaults.cpp \
|
||||||
..\utf8ToWideCharParser.cpp \
|
|
||||||
..\conareainfo.cpp \
|
..\conareainfo.cpp \
|
||||||
..\conimeinfo.cpp \
|
..\conimeinfo.cpp \
|
||||||
..\ConsoleArguments.cpp \
|
..\ConsoleArguments.cpp \
|
||||||
|
|||||||
@@ -28,7 +28,6 @@
|
|||||||
<ClCompile Include="TextBufferTests.cpp" />
|
<ClCompile Include="TextBufferTests.cpp" />
|
||||||
<ClCompile Include="TitleTests.cpp" />
|
<ClCompile Include="TitleTests.cpp" />
|
||||||
<ClCompile Include="UtilsTests.cpp" />
|
<ClCompile Include="UtilsTests.cpp" />
|
||||||
<ClCompile Include="Utf8ToWideCharParserTests.cpp" />
|
|
||||||
<ClCompile Include="InputBufferTests.cpp" />
|
<ClCompile Include="InputBufferTests.cpp" />
|
||||||
<ClCompile Include="ViewportTests.cpp" />
|
<ClCompile Include="ViewportTests.cpp" />
|
||||||
<ClCompile Include="VtIoTests.cpp" />
|
<ClCompile Include="VtIoTests.cpp" />
|
||||||
|
|||||||
@@ -39,9 +39,6 @@
|
|||||||
<ClCompile Include="..\precomp.cpp">
|
<ClCompile Include="..\precomp.cpp">
|
||||||
<Filter>Source Files</Filter>
|
<Filter>Source Files</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<ClCompile Include="Utf8ToWideCharParserTests.cpp">
|
|
||||||
<Filter>Source Files</Filter>
|
|
||||||
</ClCompile>
|
|
||||||
<ClCompile Include="InitTests.cpp">
|
<ClCompile Include="InitTests.cpp">
|
||||||
<Filter>Source Files</Filter>
|
<Filter>Source Files</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
|||||||
@@ -1,405 +0,0 @@
|
|||||||
// Copyright (c) Microsoft Corporation.
|
|
||||||
// Licensed under the MIT license.
|
|
||||||
|
|
||||||
#include "precomp.h"
|
|
||||||
#include "WexTestClass.h"
|
|
||||||
#include "../../inc/consoletaeftemplates.hpp"
|
|
||||||
|
|
||||||
#include "utf8ToWideCharParser.hpp"
|
|
||||||
|
|
||||||
#define IsBitSet WI_IsFlagSet
|
|
||||||
|
|
||||||
using namespace WEX::Common;
|
|
||||||
using namespace WEX::Logging;
|
|
||||||
using namespace WEX::TestExecution;
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class Utf8ToWideCharParserTests
|
|
||||||
{
|
|
||||||
static const unsigned int utf8CodePage = 65001;
|
|
||||||
static const unsigned int USACodePage = 1252;
|
|
||||||
|
|
||||||
TEST_CLASS(Utf8ToWideCharParserTests);
|
|
||||||
|
|
||||||
// Verifies that plain ASCII input is widened one-to-one: five input
// bytes must be fully consumed and produce five wide chars.
TEST_METHOD(ConvertsAsciiTest)
{
    Log::Comment(L"Testing that ASCII chars are correctly converted to wide chars");
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    // ascii "hello" and the bytes of its expected wide char form
    // (low byte first, as the output buffer is inspected byte-wise below)
    const unsigned char hello[5] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f };
    const unsigned char wideHello[10] = { 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00 };
    const unsigned int count = 5;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };

    VERIFY_SUCCEEDED(parser.Parse(hello, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 5u);
    VERIFY_ARE_EQUAL(generated, 5u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    // size_t index: ARRAYSIZE yields an unsigned value, so an int index
    // would be a signed/unsigned comparison.
    const auto pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (size_t i = 0; i < ARRAYSIZE(wideHello); ++i)
    {
        VERIFY_ARE_EQUAL(wideHello[i], pReturnedBytes[i]);
    }
}
|
|
||||||
|
|
||||||
// Verifies that two complete 3-byte UTF-8 sequences are converted into
// two wide chars in a single Parse call.
TEST_METHOD(ConvertSimpleUtf8Test)
{
    Log::Comment(L"Testing that a simple UTF8 sequence can be converted");
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    // U+3059, U+3057 (hiragana sushi)
    const unsigned char sushi[6] = { 0xe3, 0x81, 0x99, 0xe3, 0x81, 0x97 };
    const unsigned char wideSushi[4] = { 0x59, 0x30, 0x57, 0x30 };
    const unsigned int count = 6;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };

    VERIFY_SUCCEEDED(parser.Parse(sushi, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 6u);
    VERIFY_ARE_EQUAL(generated, 2u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    // size_t index avoids comparing a signed int against ARRAYSIZE.
    const auto pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (size_t i = 0; i < ARRAYSIZE(wideSushi); ++i)
    {
        VERIFY_ARE_EQUAL(wideSushi[i], pReturnedBytes[i]);
    }
}
|
|
||||||
|
|
||||||
// Feeds a 3-byte sequence one byte at a time. The first two calls must
// consume their byte without producing output; the third call completes
// the sequence and yields one wide char.
TEST_METHOD(WaitsForAdditionalInputAfterPartialSequenceTest)
{
    Log::Comment(L"Testing that nothing is returned when parsing a partial sequence until the sequence is complete");

    // U+3057 (hiragana shi)
    const unsigned char shi[3] = { 0xe3, 0x81, 0x97 };
    const unsigned char wideShi[2] = { 0x57, 0x30 };
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    // Every call hands over exactly one byte. Parse takes the count by
    // value, so it never needs to be re-initialized between calls.
    const unsigned int count = 1;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };

    for (auto i = 0; i < 2; ++i)
    {
        VERIFY_SUCCEEDED(parser.Parse(shi + i, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, 1u);
        VERIFY_ARE_EQUAL(generated, 0u);
        VERIFY_ARE_EQUAL(output.get(), nullptr);
    }

    VERIFY_SUCCEEDED(parser.Parse(shi + 2, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 1u);
    VERIFY_ARE_EQUAL(generated, 1u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    // size_t index avoids comparing a signed int against ARRAYSIZE.
    const auto pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (size_t i = 0; i < ARRAYSIZE(wideShi); ++i)
    {
        VERIFY_ARE_EQUAL(wideShi[i], pReturnedBytes[i]);
    }
}
|
|
||||||
|
|
||||||
// Sends one complete sequence followed by the lead byte of a second
// one. The complete char must be returned immediately; the second char
// must only appear once its remaining two bytes have been supplied.
TEST_METHOD(ReturnsInitialPartOfSequenceThatEndsWithPartialTest)
{
    Log::Comment(L"Testing that a valid portion of a sequence is returned when it ends with a partial sequence");

    // U+3059, U+3057 (hiragana sushi)
    const unsigned char sushi[6] = { 0xe3, 0x81, 0x99, 0xe3, 0x81, 0x97 };
    const unsigned char wideSushi[4] = { 0x59, 0x30, 0x57, 0x30 };
    unsigned int count = 4;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    VERIFY_SUCCEEDED(parser.Parse(sushi, count, consumed, output, generated));
    // check that we got the first wide char back
    VERIFY_ARE_EQUAL(consumed, 4u);
    VERIFY_ARE_EQUAL(generated, 1u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    auto pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (auto i = 0; i < 2; ++i)
    {
        VERIFY_ARE_EQUAL(wideSushi[i], pReturnedBytes[i]);
    }

    // add byte 2 of 3 to parser
    count = 1;
    consumed = 0;
    generated = 0;
    output.reset(nullptr);
    VERIFY_SUCCEEDED(parser.Parse(sushi + 4, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 1u);
    VERIFY_ARE_EQUAL(generated, 0u);
    VERIFY_ARE_EQUAL(output.get(), nullptr);

    // add last byte
    count = 1;
    consumed = 0;
    generated = 0;
    output.reset(nullptr);
    VERIFY_SUCCEEDED(parser.Parse(sushi + 5, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 1u);
    VERIFY_ARE_EQUAL(generated, 1u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    // the second wide char occupies bytes 2..3 of the expected data
    pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (auto i = 0; i < 2; ++i)
    {
        VERIFY_ARE_EQUAL(wideSushi[i + 2], pReturnedBytes[i]);
    }
}
|
|
||||||
|
|
||||||
// Splits eight 3-byte sequences into chunks of 4, 16 and 4 bytes so that
// every chunk boundary falls inside a sequence, and checks that the
// parser stitches the pieces back together across calls.
TEST_METHOD(MergesMultiplePartialSequencesTest)
{
    Log::Comment(L"Testing that partial sequences sent individually will be merged together");

    // clang-format off
    // (hiragana doomo arigatoo)
    const unsigned char doomoArigatoo[24] = {
        0xe3, 0x81, 0xa9, // U+3069
        0xe3, 0x81, 0x86, // U+3046
        0xe3, 0x82, 0x82, // U+3082
        0xe3, 0x81, 0x82, // U+3042
        0xe3, 0x82, 0x8a, // U+308A
        0xe3, 0x81, 0x8c, // U+304C
        0xe3, 0x81, 0xa8, // U+3068
        0xe3, 0x81, 0x86  // U+3046
    };
    const unsigned char wideDoomoArigatoo[16] = {
        0x69, 0x30,
        0x46, 0x30,
        0x82, 0x30,
        0x42, 0x30,
        0x8a, 0x30,
        0x4c, 0x30,
        0x68, 0x30,
        0x46, 0x30
    };
    // clang-format on

    // send first 4 bytes: one full char plus the lead byte of the next
    unsigned int count = 4;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    VERIFY_SUCCEEDED(parser.Parse(doomoArigatoo, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 4u);
    VERIFY_ARE_EQUAL(generated, 1u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    auto pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (auto i = 0; i < 2; ++i)
    {
        VERIFY_ARE_EQUAL(wideDoomoArigatoo[i], pReturnedBytes[i]);
    }

    // send next 16 bytes: completes chars 2 through 6 and leaves two
    // bytes of char 7 pending
    count = 16;
    consumed = 0;
    generated = 0;
    output.reset(nullptr);
    VERIFY_SUCCEEDED(parser.Parse(doomoArigatoo + 4, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 16u);
    VERIFY_ARE_EQUAL(generated, 5u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (auto i = 0; i < 10; ++i)
    {
        VERIFY_ARE_EQUAL(wideDoomoArigatoo[i + 2], pReturnedBytes[i]);
    }

    // send last 4 bytes: completes chars 7 and 8
    count = 4;
    consumed = 0;
    generated = 0;
    output.reset(nullptr);
    VERIFY_SUCCEEDED(parser.Parse(doomoArigatoo + 20, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 4u);
    VERIFY_ARE_EQUAL(generated, 2u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (auto i = 0; i < 4; ++i)
    {
        VERIFY_ARE_EQUAL(wideDoomoArigatoo[i + 12], pReturnedBytes[i]);
    }
}
|
|
||||||
|
|
||||||
// Places three stray continuation bytes between two valid sequences and
// expects only the two valid chars to come out, with all nine input
// bytes reported as consumed.
TEST_METHOD(RemovesInvalidSequencesTest)
{
    Log::Comment(L"Testing that invalid sequences are removed and don't stop the parsing of the rest");

    // clang-format off
    // hiragana sushi with junk between japanese characters
    const unsigned char sushi[9] = {
        0xe3, 0x81, 0x99, // U+3059
        0x80, 0x81, 0x82, // junk continuation bytes
        0xe3, 0x81, 0x97  // U+3057
    };
    // clang-format on

    const unsigned char wideSushi[4] = { 0x59, 0x30, 0x57, 0x30 };
    const unsigned int count = 9;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    VERIFY_SUCCEEDED(parser.Parse(sushi, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(consumed, 9u);
    VERIFY_ARE_EQUAL(generated, 2u);
    VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);

    // size_t index avoids comparing a signed int against ARRAYSIZE.
    const auto pReturnedBytes = reinterpret_cast<const unsigned char*>(output.get());
    for (size_t i = 0; i < ARRAYSIZE(wideSushi); ++i)
    {
        VERIFY_ARE_EQUAL(wideSushi[i], pReturnedBytes[i]);
    }
}
|
|
||||||
|
|
||||||
// Embeds overlong (non-minimal) encodings of U+0000 between runs of
// single-byte characters and checks that each overlong form is turned
// into U+FFFD replacements without disturbing the surrounding text.
TEST_METHOD(NonMinimalFormTest)
{
    Log::Comment(L"Testing that non-minimal forms of a character are tolerated don't stop the rest");

    // clang-format off

    // Input bytes
    const unsigned char data[] = {
        0x60, 0x12, 0x08, 0x7f,  // single byte points
        0xc0, 0x80,              // U+0000 as a 2-byte sequence (non-minimal)
        0x41, 0x48, 0x06, 0x55,  // more single byte points
        0xe0, 0x80, 0x80,        // U+0000 as a 3-byte sequence (non-minimal)
        0x18, 0x77, 0x40, 0x31,  // more single byte points
        0xf0, 0x80, 0x80, 0x80,  // U+0000 as a 4-byte sequence (non-minimal)
        0x59, 0x1f, 0x68, 0x20   // more single byte points
    };

    // Expected wide chars.
    // The number of replacements per invalid sequence is not intended to
    // be load-bearing. It is just representative of what it looked like
    // when fixing this for GH#3380. Change if necessary when completing
    // GH#3378.
    const wchar_t wideData[] = {
        0x0060, 0x0012, 0x0008, 0x007f,
        0xfffd, 0xfffd,
        0x0041, 0x0048, 0x0006, 0x0055,
        0xfffd, 0xfffd,
        0x0018, 0x0077, 0x0040, 0x0031,
        0xfffd, 0xfffd, 0xfffd,
        0x0059, 0x001f, 0x0068, 0x0020
    };

    // clang-format on

    const auto count = gsl::narrow_cast<unsigned int>(ARRAYSIZE(data));
    const auto wideCount = gsl::narrow_cast<unsigned int>(ARRAYSIZE(wideData));

    Utf8ToWideCharParser parser{ utf8CodePage };
    std::unique_ptr<wchar_t[]> output{ nullptr };
    unsigned int consumed = 0;
    unsigned int generated = 0;

    VERIFY_SUCCEEDED(parser.Parse(data, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(count, consumed);
    VERIFY_ARE_EQUAL(wideCount, generated);
    VERIFY_IS_NOT_NULL(output.get());

    // Compare as counted strings so embedded control characters are
    // included in the comparison.
    const auto expected = WEX::Common::String(wideData, wideCount);
    const auto actual = WEX::Common::String(output.get(), generated);
    VERIFY_ARE_EQUAL(expected, actual);
}
|
|
||||||
|
|
||||||
// Stores an incomplete sequence in the parser, then checks that setting
// the same code page keeps it while switching to a different code page
// discards it and returns the parser to the Ready state.
TEST_METHOD(PartialBytesAreDroppedOnCodePageChangeTest)
{
    Log::Comment(L"Testing that a saved partial sequence is cleared when the codepage changes");
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    // 2 bytes of a 4 byte sequence
    const unsigned int inputSize = 2;
    const unsigned char partialSequence[inputSize] = { 0xF0, 0x80 };
    const auto count = inputSize;
    unsigned int consumed = 0;
    unsigned int generated = 0;
    std::unique_ptr<wchar_t[]> output{ nullptr };

    VERIFY_SUCCEEDED(parser.Parse(partialSequence, count, consumed, output, generated));
    VERIFY_ARE_EQUAL(parser._currentState, Utf8ToWideCharParser::_State::BeginPartialParse);
    VERIFY_ARE_EQUAL(parser._bytesStored, inputSize);

    // set the codepage to the same one it currently is, ensure
    // that nothing changes
    parser.SetCodePage(utf8CodePage);
    VERIFY_ARE_EQUAL(parser._currentState, Utf8ToWideCharParser::_State::BeginPartialParse);
    VERIFY_ARE_EQUAL(parser._bytesStored, inputSize);

    // change to a different codepage, ensure parser is reset
    parser.SetCodePage(USACodePage);
    VERIFY_ARE_EQUAL(parser._currentState, Utf8ToWideCharParser::_State::Ready);
    VERIFY_ARE_EQUAL(parser._bytesStored, 0u);
}
|
|
||||||
|
|
||||||
// Spot-checks _IsLeadByte against the lead bytes of 2-, 3- and 4-byte
// sequences and against bytes that must be rejected.
TEST_METHOD(_IsLeadByteTest)
{
    Log::Comment(L"Testing that _IsLeadByte properly differentiates correct from incorrect sequences");
    Utf8ToWideCharParser parser{ utf8CodePage };

    // accepted: lead bytes announcing 2 to 4 bytes
    VERIFY_IS_TRUE(parser._IsLeadByte(0xC0)); // 2 byte sequence
    VERIFY_IS_TRUE(parser._IsLeadByte(0xE0)); // 3 byte sequence
    VERIFY_IS_TRUE(parser._IsLeadByte(0xF0)); // 4 byte sequence

    // rejected: ASCII and continuation bytes
    VERIFY_IS_FALSE(parser._IsLeadByte(0x00)); // ASCII char NUL
    VERIFY_IS_FALSE(parser._IsLeadByte(0x80)); // continuation byte
    VERIFY_IS_FALSE(parser._IsLeadByte(0x83)); // continuation byte
    VERIFY_IS_FALSE(parser._IsLeadByte(0x7E)); // ASCII char '~'
    VERIFY_IS_FALSE(parser._IsLeadByte(0x21)); // ASCII char '!'

    // rejected: prefixes announcing more than 4 bytes
    VERIFY_IS_FALSE(parser._IsLeadByte(0xF8)); // invalid 5 byte sequence
    VERIFY_IS_FALSE(parser._IsLeadByte(0xFC)); // invalid 6 byte sequence
    VERIFY_IS_FALSE(parser._IsLeadByte(0xFE)); // invalid 7 byte sequence
    VERIFY_IS_FALSE(parser._IsLeadByte(0xFF)); // all 1's
}
|
|
||||||
|
|
||||||
// Exhaustively checks _IsContinuationByte: a byte qualifies exactly when
// its two most significant bits are 1 followed by 0.
TEST_METHOD(_IsContinuationByteTest)
{
    Log::Comment(L"Testing that _IsContinuationByte properly differentiates correct from incorrect sequences");
    Utf8ToWideCharParser parser{ utf8CodePage };

    // The loop stops before 0xFF because a BYTE counter cannot exceed
    // that value; 0xFF is verified separately afterwards.
    for (BYTE i = 0x00; i < 0xFF; ++i)
    {
        const auto hasContinuationPrefix = IsBitSet(i, 0x80) && !IsBitSet(i, 0x40);
        if (!hasContinuationPrefix)
        {
            VERIFY_IS_FALSE(parser._IsContinuationByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
            continue;
        }
        VERIFY_IS_TRUE(parser._IsContinuationByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
    }
    VERIFY_IS_FALSE(parser._IsContinuationByte(0xFF));
}
|
|
||||||
|
|
||||||
// Exhaustively checks _IsAsciiByte: 0x00 through 0x7F must be accepted
// and 0x80 through 0xFF must be rejected.
TEST_METHOD(_IsAsciiByteTest)
{
    Log::Comment(L"Testing that _IsAsciiByte properly differentiates correct from incorrect sequences");
    Utf8ToWideCharParser parser{ utf8CodePage };

    // lower half of the byte range, ascending
    for (BYTE i = 0x00; i < 0x80; ++i)
    {
        VERIFY_IS_TRUE(parser._IsAsciiByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
    }

    // upper half, walked downwards so the BYTE counter ends at 0x7F
    // rather than having to step past 0xFF
    for (BYTE i = 0xFF; i > 0x7F; --i)
    {
        VERIFY_IS_FALSE(parser._IsAsciiByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
    }
}
|
|
||||||
|
|
||||||
// Checks that _Utf8SequenceSize reports the number of consecutive 1 bits
// at the most significant end of a byte, for every possible count (0-8).
TEST_METHOD(_Utf8SequenceSizeTest)
{
    Log::Comment(L"Testing that _Utf8SequenceSize correctly counts the number of MSB 1's");
    auto parser = Utf8ToWideCharParser{ utf8CodePage };

    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0x00), 0u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0x80), 1u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xC2), 2u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xE3), 3u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xF0), 4u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xF3), 4u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xF8), 5u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFC), 6u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFD), 6u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFE), 7u);
    VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFF), 8u);
}
|
|
||||||
};
|
|
||||||
@@ -27,7 +27,6 @@ SOURCES = \
|
|||||||
TextBufferTests.cpp \
|
TextBufferTests.cpp \
|
||||||
ClipboardTests.cpp \
|
ClipboardTests.cpp \
|
||||||
SelectionTests.cpp \
|
SelectionTests.cpp \
|
||||||
Utf8ToWideCharParserTests.cpp \
|
|
||||||
OutputCellIteratorTests.cpp \
|
OutputCellIteratorTests.cpp \
|
||||||
InitTests.cpp \
|
InitTests.cpp \
|
||||||
TitleTests.cpp \
|
TitleTests.cpp \
|
||||||
|
|||||||
@@ -1,520 +0,0 @@
|
|||||||
// Copyright (c) Microsoft Corporation.
|
|
||||||
// Licensed under the MIT license.
|
|
||||||
|
|
||||||
#include "precomp.h"
|
|
||||||
|
|
||||||
#include "utf8ToWideCharParser.hpp"
|
|
||||||
#include <unicode.hpp>
|
|
||||||
|
|
||||||
#ifndef WIL_ENABLE_EXCEPTIONS
|
|
||||||
#error WIL exception helpers must be enabled
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define IsBitSet WI_IsFlagSet
|
|
||||||
|
|
||||||
// Bit that is set on every byte which is not an ASCII-compatible byte.
constexpr byte NonAsciiBytePrefix{ 0x80 };

// A continuation byte has the form 10xx xxxx: the mask isolates the two
// most significant bits and the prefix is the value they must have.
constexpr byte ContinuationByteMask{ 0xC0 };
constexpr byte ContinuationBytePrefix{ 0x80 };

// Isolates the most significant bit of a byte.
constexpr byte MostSignificantBitMask{ 0x80 };
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Constructs an instance of the parser.
|
|
||||||
// Arguments:
|
|
||||||
// - codePage - Starting code page to interpret input with.
|
|
||||||
// Return Value:
|
|
||||||
// - A new instance of the parser.
|
|
||||||
Utf8ToWideCharParser::Utf8ToWideCharParser(const unsigned int codePage) :
|
|
||||||
_currentCodePage{ codePage },
|
|
||||||
_bytesStored{ 0 },
|
|
||||||
_currentState{ _State::Ready },
|
|
||||||
_convertedWideChars{ nullptr }
|
|
||||||
{
|
|
||||||
std::fill_n(_utf8CodePointPieces, _UTF8_BYTE_SEQUENCE_MAX, 0ui8);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Set the code page that input sequences will correspond to. Clears
|
|
||||||
// any saved partial multi-byte sequences if the code page changes
|
|
||||||
// from the code page the partial sequence is associated with.
|
|
||||||
// Arguments:
|
|
||||||
// - codePage - the code page to set to.
|
|
||||||
// Return Value:
|
|
||||||
// - <none>
|
|
||||||
void Utf8ToWideCharParser::SetCodePage(const unsigned int codePage)
|
|
||||||
{
|
|
||||||
if (_currentCodePage != codePage)
|
|
||||||
{
|
|
||||||
_currentCodePage = codePage;
|
|
||||||
// we can't be making any assumptions about the partial
|
|
||||||
// sequence we were storing now that the codepage has changed
|
|
||||||
_bytesStored = 0;
|
|
||||||
_currentState = _State::Ready;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Parses the input multi-byte sequence.
|
|
||||||
// Arguments:
|
|
||||||
// - pBytes - The byte sequence to parse.
|
|
||||||
// - cchBuffer - The amount of bytes in pBytes. This will contain the
|
|
||||||
// number of wide chars contained by converted after this function is
|
|
||||||
// run, or 0 if an error occurs (or if pBytes is 0).
|
|
||||||
// - converted - a valid unique_ptr to store the parsed wide chars
|
|
||||||
// in. On error this will contain nullptr instead of an array.
|
|
||||||
// Return Value:
|
|
||||||
// - <none>
|
|
||||||
[[nodiscard]] HRESULT Utf8ToWideCharParser::Parse(_In_reads_(cchBuffer) const byte* const pBytes,
|
|
||||||
_In_ const unsigned int cchBuffer,
|
|
||||||
_Out_ unsigned int& cchConsumed,
|
|
||||||
_Inout_ std::unique_ptr<wchar_t[]>& converted,
|
|
||||||
_Out_ unsigned int& cchConverted)
|
|
||||||
{
|
|
||||||
cchConsumed = 0;
|
|
||||||
cchConverted = 0;
|
|
||||||
|
|
||||||
// we can't parse anything if we weren't given any data to parse
|
|
||||||
if (cchBuffer == 0)
|
|
||||||
{
|
|
||||||
return S_OK;
|
|
||||||
}
|
|
||||||
// we shouldn't be parsing if the current codepage isn't UTF8
|
|
||||||
if (_currentCodePage != CP_UTF8)
|
|
||||||
{
|
|
||||||
_currentState = _State::Error;
|
|
||||||
}
|
|
||||||
auto hr = S_OK;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
auto loop = true;
|
|
||||||
unsigned int wideCharCount = 0;
|
|
||||||
_convertedWideChars.reset(nullptr);
|
|
||||||
while (loop)
|
|
||||||
{
|
|
||||||
switch (_currentState)
|
|
||||||
{
|
|
||||||
case _State::Ready:
|
|
||||||
wideCharCount = _ParseFullRange(pBytes, cchBuffer);
|
|
||||||
break;
|
|
||||||
case _State::BeginPartialParse:
|
|
||||||
wideCharCount = _InvolvedParse(pBytes, cchBuffer);
|
|
||||||
break;
|
|
||||||
case _State::Error:
|
|
||||||
hr = E_FAIL;
|
|
||||||
_Reset();
|
|
||||||
wideCharCount = 0;
|
|
||||||
loop = false;
|
|
||||||
break;
|
|
||||||
case _State::Finished:
|
|
||||||
_currentState = _State::Ready;
|
|
||||||
cchConsumed = cchBuffer;
|
|
||||||
loop = false;
|
|
||||||
break;
|
|
||||||
case _State::AwaitingMoreBytes:
|
|
||||||
_currentState = _State::BeginPartialParse;
|
|
||||||
cchConsumed = cchBuffer;
|
|
||||||
loop = false;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
_currentState = _State::Error;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
converted.swap(_convertedWideChars);
|
|
||||||
cchConverted = wideCharCount;
|
|
||||||
}
|
|
||||||
catch (...)
|
|
||||||
{
|
|
||||||
_Reset();
|
|
||||||
hr = wil::ResultFromCaughtException();
|
|
||||||
}
|
|
||||||
return hr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Determines if ch is a UTF8 lead byte. See _Utf8SequenceSize() for a
|
|
||||||
// description of how a lead byte is specified.
|
|
||||||
// Arguments:
|
|
||||||
// - ch - The byte to test.
|
|
||||||
// Return Value:
|
|
||||||
// - True if ch is a lead byte, false otherwise.
|
|
||||||
bool Utf8ToWideCharParser::_IsLeadByte(_In_ byte ch)
|
|
||||||
{
|
|
||||||
auto sequenceSize = _Utf8SequenceSize(ch);
|
|
||||||
return !_IsContinuationByte(ch) &&
|
|
||||||
!_IsAsciiByte(ch) &&
|
|
||||||
sequenceSize > 1 &&
|
|
||||||
sequenceSize <= _UTF8_BYTE_SEQUENCE_MAX;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Determines if ch is a UTF8 continuation byte. A continuation byte
|
|
||||||
// takes the form 10xx xxxx, so we need to check that the two most
|
|
||||||
// significant bits are a 1 followed by a 0.
|
|
||||||
// Arguments:
|
|
||||||
// - ch - The byte to test
|
|
||||||
// Return Value:
|
|
||||||
// - True if ch is a continuation byte, false otherwise.
|
|
||||||
bool Utf8ToWideCharParser::_IsContinuationByte(_In_ byte ch)
|
|
||||||
{
|
|
||||||
return (ch & ContinuationByteMask) == ContinuationBytePrefix;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Determines if ch is an ASCII compatible UTF8 byte. A byte is
|
|
||||||
// ASCII compatible if the most significant bit is a 0.
|
|
||||||
// Arguments:
|
|
||||||
// - ch - The byte to test.
|
|
||||||
// Return Value:
|
|
||||||
// - True if ch is an ASCII compatible byte, false otherwise.
|
|
||||||
bool Utf8ToWideCharParser::_IsAsciiByte(_In_ byte ch)
|
|
||||||
{
|
|
||||||
return !IsBitSet(ch, NonAsciiBytePrefix);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Determines if the sequence starting at pLeadByte is a valid UTF8
|
|
||||||
// multi-byte sequence. Note that a single ASCII byte does not count
|
|
||||||
// as a valid MULTI-byte sequence.
|
|
||||||
// Arguments:
|
|
||||||
// - pLeadByte - The start of a possible sequence.
|
|
||||||
// - cb - The amount of remaining chars in the array that
|
|
||||||
// pLeadByte points to.
|
|
||||||
// Return Value:
|
|
||||||
// - true if the sequence starting at pLeadByte is a multi-byte
|
|
||||||
// sequence and uses all of the remaining chars, false otherwise.
|
|
||||||
bool Utf8ToWideCharParser::_IsValidMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
|
|
||||||
{
|
|
||||||
if (!_IsLeadByte(*pLeadByte))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const auto sequenceSize = _Utf8SequenceSize(*pLeadByte);
|
|
||||||
if (sequenceSize > cb)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// i starts at 1 so that we skip the lead byte
|
|
||||||
for (unsigned int i = 1; i < sequenceSize; ++i)
|
|
||||||
{
|
|
||||||
const auto ch = *(pLeadByte + i);
|
|
||||||
if (!_IsContinuationByte(ch))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Checks if the sequence starting at pLeadByte is a portion of a
|
|
||||||
// single valid multi-byte sequence. A new sequence must not be
|
|
||||||
// started within the range provided in order for it to be considered
|
|
||||||
// a valid partial sequence.
|
|
||||||
// Arguments:
|
|
||||||
// - pLeadByte - The start of the possible partial sequence.
|
|
||||||
// - cb - The amount of remaining chars in the array that
|
|
||||||
// pLeadByte points to.
|
|
||||||
// Return Value:
|
|
||||||
// - true if the sequence is a single partial multi-byte sequence,
|
|
||||||
// false otherwise.
|
|
||||||
bool Utf8ToWideCharParser::_IsPartialMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
|
|
||||||
{
|
|
||||||
if (!_IsLeadByte(*pLeadByte))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const auto sequenceSize = _Utf8SequenceSize(*pLeadByte);
|
|
||||||
if (sequenceSize <= cb)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// i starts at 1 so that we skip the lead byte
|
|
||||||
for (unsigned int i = 1; i < cb; ++i)
|
|
||||||
{
|
|
||||||
const auto ch = *(pLeadByte + i);
|
|
||||||
if (!_IsContinuationByte(ch))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Determines the number of bytes in the UTF8 multi-byte sequence.
|
|
||||||
// Does not perform any verification that ch is a valid lead byte. A
|
|
||||||
// lead byte indicates how many bytes are in a sequence by repeating a
|
|
||||||
// 1 for each byte in the sequence, starting with the most significant
|
|
||||||
// bit, then a 0 directly after. Ex:
|
|
||||||
// - 110x xxxx = a two byte sequence
|
|
||||||
// - 1110 xxxx = a three byte sequence
|
|
||||||
//
|
|
||||||
// Note that a byte that has a pattern 10xx xxxx is a continuation
|
|
||||||
// byte and will be reported as a sequence of one by this function.
|
|
||||||
//
|
|
||||||
// A sequence is currently a maximum of four bytes but this function
|
|
||||||
// will just count the number of consecutive 1 bits (starting with the
|
|
||||||
// most significant bit) so if the byte is malformed (ex. 1111 110x) a
|
|
||||||
// number larger than the maximum utf8 byte sequence may be
|
|
||||||
// returned. It is the responsibility of the calling function to check
|
|
||||||
// this (and the continuation byte scenario) because we don't do any
|
|
||||||
// verification here.
|
|
||||||
// Arguments:
|
|
||||||
// - ch - the lead byte of a UTF8 multi-byte sequence.
|
|
||||||
// Return Value:
|
|
||||||
// - The number of bytes (including the lead byte) that ch indicates
|
|
||||||
// are in the sequence.
|
|
||||||
unsigned int Utf8ToWideCharParser::_Utf8SequenceSize(_In_ byte ch)
|
|
||||||
{
|
|
||||||
unsigned int msbOnes = 0;
|
|
||||||
while (IsBitSet(ch, MostSignificantBitMask))
|
|
||||||
{
|
|
||||||
++msbOnes;
|
|
||||||
ch <<= 1;
|
|
||||||
}
|
|
||||||
return msbOnes;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Attempts to parse pInputChars by themselves in wide chars,
|
|
||||||
// without using any saved partial byte sequences. On success,
|
|
||||||
// _convertedWideChars will contain the converted wide char sequence
|
|
||||||
// and _currentState will be set to _State::Finished. On failure,
|
|
||||||
// _currentState will be set to either _State::Error or
|
|
||||||
// _State::BeginPartialParse.
|
|
||||||
// Arguments:
|
|
||||||
// - pInputChars - The byte sequence to convert to wide chars.
|
|
||||||
// - cb - The amount of bytes in pInputChars.
|
|
||||||
// Return Value:
|
|
||||||
// - The amount of wide chars that are stored in _convertedWideChars,
|
|
||||||
// or 0 if pInputChars cannot be successfully converted.
|
|
||||||
unsigned int Utf8ToWideCharParser::_ParseFullRange(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
|
|
||||||
{
|
|
||||||
auto bufferSize = MultiByteToWideChar(_currentCodePage,
|
|
||||||
MB_ERR_INVALID_CHARS,
|
|
||||||
reinterpret_cast<LPCCH>(pInputChars),
|
|
||||||
cb,
|
|
||||||
nullptr,
|
|
||||||
0);
|
|
||||||
if (bufferSize == 0)
|
|
||||||
{
|
|
||||||
auto err = GetLastError();
|
|
||||||
LOG_WIN32(err);
|
|
||||||
if (err == ERROR_NO_UNICODE_TRANSLATION)
|
|
||||||
{
|
|
||||||
_currentState = _State::BeginPartialParse;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
_currentState = _State::Error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
_convertedWideChars = std::make_unique<wchar_t[]>(bufferSize);
|
|
||||||
bufferSize = MultiByteToWideChar(_currentCodePage,
|
|
||||||
0,
|
|
||||||
reinterpret_cast<LPCCH>(pInputChars),
|
|
||||||
cb,
|
|
||||||
_convertedWideChars.get(),
|
|
||||||
bufferSize);
|
|
||||||
if (bufferSize == 0)
|
|
||||||
{
|
|
||||||
LOG_LAST_ERROR();
|
|
||||||
_currentState = _State::Error;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
_currentState = _State::Finished;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return bufferSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Attempts to parse pInputChars in a more complex manner, taking
|
|
||||||
// into account any previously saved partial byte sequences while
|
|
||||||
// removing any invalid byte sequences. Will also save a partial byte
|
|
||||||
// sequence from the end of the sequence if necessary. If the sequence
|
|
||||||
// can be successfully parsed, _currentState will be set to
|
|
||||||
// _State::Finished. If more bytes are necessary to form a wide char,
|
|
||||||
// then _currentState will be set to
|
|
||||||
// _State::AwaitingMoreBytes. Otherwise, _currentState will be set to
|
|
||||||
// _State::Error.
|
|
||||||
// Arguments:
|
|
||||||
// - pInputChars - The byte sequence to convert to wide chars.
|
|
||||||
// - cb - The amount of bytes in pInputChars.
|
|
||||||
// Return Value:
|
|
||||||
// - The amount of wide chars that are stored in _convertedWideChars,
|
|
||||||
// or 0 if pInputChars cannot be successfully converted or if the
|
|
||||||
// parser requires additional bytes before returning a valid wide
|
|
||||||
// char.
|
|
||||||
unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
|
|
||||||
{
|
|
||||||
// Do safe math to add up the count and error if it won't fit.
|
|
||||||
unsigned int count;
|
|
||||||
const auto hr = UIntAdd(cb, _bytesStored, &count);
|
|
||||||
if (FAILED(hr))
|
|
||||||
{
|
|
||||||
LOG_HR(hr);
|
|
||||||
_currentState = _State::Error;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allocate space and copy.
|
|
||||||
auto combinedInputBytes = std::make_unique<byte[]>(count);
|
|
||||||
std::copy(_utf8CodePointPieces, _utf8CodePointPieces + _bytesStored, combinedInputBytes.get());
|
|
||||||
std::copy(pInputChars, pInputChars + cb, combinedInputBytes.get() + _bytesStored);
|
|
||||||
_bytesStored = 0;
|
|
||||||
auto validSequence = _RemoveInvalidSequences(combinedInputBytes.get(), count);
|
|
||||||
// the input may have only been a partial sequence so we need to
|
|
||||||
// check that there are actually any bytes that we can convert
|
|
||||||
// right now
|
|
||||||
if (validSequence.second == 0 && _bytesStored > 0)
|
|
||||||
{
|
|
||||||
_currentState = _State::AwaitingMoreBytes;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// By this point, all obviously invalid sequences have been removed.
|
|
||||||
// But non-minimal forms of sequences might still exist.
|
|
||||||
// MB2WC will fail non-minimal forms with MB_ERR_INVALID_CHARS at this point.
|
|
||||||
// So we call with flags = 0 such that non-minimal forms get the U+FFFD
|
|
||||||
// replacement character treatment.
|
|
||||||
// This issue and related concerns are fully captured in future work item GH#3378
|
|
||||||
// for future cleanup and reconciliation.
|
|
||||||
// The original issue introducing this was GH#3320.
|
|
||||||
auto bufferSize = MultiByteToWideChar(_currentCodePage,
|
|
||||||
0,
|
|
||||||
reinterpret_cast<LPCCH>(validSequence.first.get()),
|
|
||||||
validSequence.second,
|
|
||||||
nullptr,
|
|
||||||
0);
|
|
||||||
if (bufferSize == 0)
|
|
||||||
{
|
|
||||||
LOG_LAST_ERROR();
|
|
||||||
_currentState = _State::Error;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
_convertedWideChars = std::make_unique<wchar_t[]>(bufferSize);
|
|
||||||
bufferSize = MultiByteToWideChar(_currentCodePage,
|
|
||||||
0,
|
|
||||||
reinterpret_cast<LPCCH>(validSequence.first.get()),
|
|
||||||
validSequence.second,
|
|
||||||
_convertedWideChars.get(),
|
|
||||||
bufferSize);
|
|
||||||
if (bufferSize == 0)
|
|
||||||
{
|
|
||||||
LOG_LAST_ERROR();
|
|
||||||
_currentState = _State::Error;
|
|
||||||
}
|
|
||||||
else if (_bytesStored > 0)
|
|
||||||
{
|
|
||||||
_currentState = _State::AwaitingMoreBytes;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
_currentState = _State::Finished;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return bufferSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Reads pInputChars byte by byte, removing any invalid UTF8
|
|
||||||
// multi-byte sequences.
|
|
||||||
// Arguments:
|
|
||||||
// - pInputChars - The byte sequence to fix.
|
|
||||||
// - cb - The amount of bytes in pInputChars.
|
|
||||||
// Return Value:
|
|
||||||
// - A std::pair containing the corrected byte sequence and the number
|
|
||||||
// of bytes in the sequence.
|
|
||||||
std::pair<std::unique_ptr<byte[]>, unsigned int> Utf8ToWideCharParser::_RemoveInvalidSequences(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
|
|
||||||
{
|
|
||||||
auto validSequence = std::make_unique<byte[]>(cb);
|
|
||||||
unsigned int validSequenceLocation = 0; // index into validSequence
|
|
||||||
unsigned int currentByteInput = 0; // index into pInputChars
|
|
||||||
while (currentByteInput < cb)
|
|
||||||
{
|
|
||||||
if (_IsAsciiByte(pInputChars[currentByteInput]))
|
|
||||||
{
|
|
||||||
validSequence[validSequenceLocation] = pInputChars[currentByteInput];
|
|
||||||
++validSequenceLocation;
|
|
||||||
++currentByteInput;
|
|
||||||
}
|
|
||||||
else if (_IsContinuationByte(pInputChars[currentByteInput]))
|
|
||||||
{
|
|
||||||
while (currentByteInput < cb && _IsContinuationByte(pInputChars[currentByteInput]))
|
|
||||||
{
|
|
||||||
++currentByteInput;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (_IsLeadByte(pInputChars[currentByteInput]))
|
|
||||||
{
|
|
||||||
if (_IsValidMultiByteSequence(&pInputChars[currentByteInput], cb - currentByteInput))
|
|
||||||
{
|
|
||||||
const auto sequenceSize = _Utf8SequenceSize(pInputChars[currentByteInput]);
|
|
||||||
// min is to guard against static analysis possible buffer overflow
|
|
||||||
const auto limit = std::min(sequenceSize, cb - currentByteInput);
|
|
||||||
for (unsigned int i = 0; i < limit; ++i)
|
|
||||||
{
|
|
||||||
validSequence[validSequenceLocation] = pInputChars[currentByteInput];
|
|
||||||
++validSequenceLocation;
|
|
||||||
++currentByteInput;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (_IsPartialMultiByteSequence(&pInputChars[currentByteInput], cb - currentByteInput))
|
|
||||||
{
|
|
||||||
_StorePartialSequence(&pInputChars[currentByteInput], cb - currentByteInput);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++currentByteInput;
|
|
||||||
while (currentByteInput < cb && _IsContinuationByte(pInputChars[currentByteInput]))
|
|
||||||
{
|
|
||||||
++currentByteInput;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// invalid byte, skip it.
|
|
||||||
++currentByteInput;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return std::make_pair<std::unique_ptr<byte[]>, unsigned int>(std::move(validSequence), std::move(validSequenceLocation));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Stores a partial byte sequence for later use. Will overwrite any
|
|
||||||
// previously saved sequence. Will only store bytes up to the limit
|
|
||||||
// Utf8ToWideCharParser::_UTF8_BYTE_SEQUENCE_MAX.
|
|
||||||
// Arguments:
|
|
||||||
// - pLeadByte - The beginning of the sequence to save.
|
|
||||||
// - cb - The amount of bytes to save.
|
|
||||||
// Return Value:
|
|
||||||
// - <none>
|
|
||||||
void Utf8ToWideCharParser::_StorePartialSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
|
|
||||||
{
|
|
||||||
const auto maxLength = std::min(cb, _UTF8_BYTE_SEQUENCE_MAX);
|
|
||||||
std::copy(pLeadByte, pLeadByte + maxLength, _utf8CodePointPieces);
|
|
||||||
_bytesStored = maxLength;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Routine Description:
|
|
||||||
// - Resets the state of the parser to that of a newly initialized
|
|
||||||
// instance. _currentCodePage is not affected.
|
|
||||||
// Arguments:
|
|
||||||
// - <none>
|
|
||||||
// Return Value:
|
|
||||||
// - <none>
|
|
||||||
void Utf8ToWideCharParser::_Reset()
|
|
||||||
{
|
|
||||||
_currentState = _State::Ready;
|
|
||||||
_bytesStored = 0;
|
|
||||||
_convertedWideChars.reset(nullptr);
|
|
||||||
}
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
/*++
|
|
||||||
Copyright (c) Microsoft Corporation
|
|
||||||
Licensed under the MIT license.
|
|
||||||
|
|
||||||
Module Name:
|
|
||||||
- utf8ToWideCharParser.hpp
|
|
||||||
|
|
||||||
Abstract:
|
|
||||||
- This transforms a multi-byte character sequence into wide chars
|
|
||||||
- It will attempt to work around invalid byte sequences
|
|
||||||
- Partial byte sequences are supported
|
|
||||||
|
|
||||||
Author(s):
|
|
||||||
- Austin Diviness (AustDi) 16-August-2016
|
|
||||||
--*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
class Utf8ToWideCharParser final
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
Utf8ToWideCharParser(const unsigned int codePage);
|
|
||||||
void SetCodePage(const unsigned int codePage);
|
|
||||||
[[nodiscard]] HRESULT Parse(_In_reads_(cchBuffer) const byte* const pBytes,
|
|
||||||
_In_ const unsigned int cchBuffer,
|
|
||||||
_Out_ unsigned int& cchConsumed,
|
|
||||||
_Inout_ std::unique_ptr<wchar_t[]>& converted,
|
|
||||||
_Out_ unsigned int& cchConverted);
|
|
||||||
|
|
||||||
private:
|
|
||||||
enum class _State
|
|
||||||
{
|
|
||||||
Ready, // ready for input, no partially parsed code points
|
|
||||||
Error, // error in parsing given bytes
|
|
||||||
BeginPartialParse, // not a clean byte sequence, needs involved parsing
|
|
||||||
AwaitingMoreBytes, // have a partial sequence saved, waiting for the rest of it
|
|
||||||
Finished // ready to return a wide char sequence
|
|
||||||
};
|
|
||||||
|
|
||||||
bool _IsLeadByte(_In_ byte ch);
|
|
||||||
bool _IsContinuationByte(_In_ byte ch);
|
|
||||||
bool _IsAsciiByte(_In_ byte ch);
|
|
||||||
bool _IsValidMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb);
|
|
||||||
bool _IsPartialMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb);
|
|
||||||
unsigned int _Utf8SequenceSize(_In_ byte ch);
|
|
||||||
unsigned int _ParseFullRange(_In_reads_(cb) const byte* const _InputChars, const unsigned int cb);
|
|
||||||
unsigned int _InvolvedParse(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb);
|
|
||||||
std::pair<std::unique_ptr<byte[]>, unsigned int> _RemoveInvalidSequences(_In_reads_(cb) const byte* const pInputChars,
|
|
||||||
const unsigned int cb);
|
|
||||||
void _StorePartialSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb);
|
|
||||||
void _Reset();
|
|
||||||
|
|
||||||
static const unsigned int _UTF8_BYTE_SEQUENCE_MAX = 4;
|
|
||||||
|
|
||||||
byte _utf8CodePointPieces[_UTF8_BYTE_SEQUENCE_MAX];
|
|
||||||
unsigned int _bytesStored; // bytes stored in utf8CodePointPieces
|
|
||||||
unsigned int _currentCodePage;
|
|
||||||
std::unique_ptr<wchar_t[]> _convertedWideChars;
|
|
||||||
_State _currentState;
|
|
||||||
|
|
||||||
#ifdef UNIT_TESTING
|
|
||||||
friend class Utf8ToWideCharParserTests;
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
Reference in New Issue
Block a user