Remove unused Utf8ToWideCharParser (#16392)

I randomly came across this class, which I didn't even remember we had. We don't use it at the moment and won't need it any time soon. Its current implementation is also fairly questionable. While `til::u16state` isn't "perfect", it's vastly better than this.
2025-12-19 18:11:39 -05:00 · 2023-11-30 15:52:39 +01:00
parent 0c4751ba30
commit 130c9fbd76
10 changed files with 0 additions and 1005 deletions
--- a/doc/ORGANIZATION.md
+++ b/doc/ORGANIZATION.md
@@ -125,8 +125,6 @@
 * Private calls into the Windows Window Manager to perform privileged actions related to the console process (working to eliminate) or for High DPI stuff (also working to eliminate)
 	* `Userprivapi.cpp`
 	* `Windowdpiapi.cpp`
 * New UTF8 state machine in progress to improve Bash (and other apps) support for UTF-8 in console
 	* `Utf8ToWideCharParser.cpp`
 * Window resizing/layout/management/window messaging loops and all that other stuff that has us interact with Windows to create a visual display surface and control the user interaction entry point
 	* `Window.cpp`
 	* `Windowproc.cpp`
--- a/src/host/host-common.vcxitems
+++ b/src/host/host-common.vcxitems
@@ -46,7 +46,6 @@
    <ClCompile Include="..\telemetry.cpp" />
    <ClCompile Include="..\tracing.cpp" />
    <ClCompile Include="..\utils.cpp" />
    <ClCompile Include="..\utf8ToWideCharParser.cpp" />
    <ClCompile Include="..\VtApiRoutines.cpp" />
    <ClCompile Include="..\VtInputThread.cpp" />
    <ClCompile Include="..\VtIo.cpp" />
@@ -100,7 +99,6 @@
    <ClInclude Include="..\telemetry.hpp" />
    <ClInclude Include="..\tracing.hpp" />
    <ClInclude Include="..\utils.hpp" />
    <ClInclude Include="..\utf8ToWideCharParser.hpp" />
    <ClInclude Include="..\VtApiRoutines.h" />
    <ClInclude Include="..\VtInputThread.hpp" />
    <ClInclude Include="..\VtIo.hpp" />
--- a/src/host/lib/hostlib.vcxproj.filters
+++ b/src/host/lib/hostlib.vcxproj.filters
@@ -111,9 +111,6 @@
    <ClCompile Include="..\conimeinfo.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="..\utf8ToWideCharParser.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="..\ntprivapi.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
@@ -266,9 +263,6 @@
    <ClInclude Include="..\outputStream.hpp">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="..\utf8ToWideCharParser.hpp">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="..\ApiRoutines.h">
      <Filter>Header Files</Filter>
    </ClInclude>
--- a/src/host/sources.inc
+++ b/src/host/sources.inc
@@ -84,7 +84,6 @@ SOURCES = \
    ..\writeData.cpp \
    ..\renderData.cpp \
    ..\renderFontDefaults.cpp \
    ..\utf8ToWideCharParser.cpp \
    ..\conareainfo.cpp \
    ..\conimeinfo.cpp \
    ..\ConsoleArguments.cpp \
--- a/src/host/ut_host/Host.UnitTests.vcxproj
+++ b/src/host/ut_host/Host.UnitTests.vcxproj
@@ -28,7 +28,6 @@
    <ClCompile Include="TextBufferTests.cpp" />
    <ClCompile Include="TitleTests.cpp" />
    <ClCompile Include="UtilsTests.cpp" />
    <ClCompile Include="Utf8ToWideCharParserTests.cpp" />
    <ClCompile Include="InputBufferTests.cpp" />
    <ClCompile Include="ViewportTests.cpp" />
    <ClCompile Include="VtIoTests.cpp" />
--- a/src/host/ut_host/Host.UnitTests.vcxproj.filters
+++ b/src/host/ut_host/Host.UnitTests.vcxproj.filters
@@ -39,9 +39,6 @@
    <ClCompile Include="..\precomp.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="Utf8ToWideCharParserTests.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="InitTests.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
--- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp
+++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
@@ -1,405 +0,0 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 #include "precomp.h"
 #include "WexTestClass.h"
 #include "../../inc/consoletaeftemplates.hpp"
 #include "utf8ToWideCharParser.hpp"
 #define IsBitSet WI_IsFlagSet
 using namespace WEX::Common;
 using namespace WEX::Logging;
 using namespace WEX::TestExecution;
 using namespace std;
 class Utf8ToWideCharParserTests
 {
    static const unsigned int utf8CodePage = 65001;
    static const unsigned int USACodePage = 1252;
    TEST_CLASS(Utf8ToWideCharParserTests);
    // Verifies that a pure-ASCII buffer is converted by a single Parse() call:
    // all five input bytes are consumed and five wide chars are produced.
    TEST_METHOD(ConvertsAsciiTest)
    {
        Log::Comment(L"Testing that ASCII chars are correctly converted to wide chars");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        // ascii "hello"
        const unsigned char hello[5] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f };
        // the expected wide chars, spelled out byte-by-byte (low byte first)
        const unsigned char wideHello[10] = { 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00 };
        unsigned int count = 5;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        VERIFY_SUCCEEDED(parser.Parse(hello, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)5);
        VERIFY_ARE_EQUAL(generated, (unsigned int)5);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        // compare the raw bytes of the returned wide chars against the expected bytes
        auto pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // size_t index: ARRAYSIZE yields an unsigned count, so don't compare it against a signed int
        for (size_t i = 0; i < ARRAYSIZE(wideHello); ++i)
        {
            VERIFY_ARE_EQUAL(wideHello[i], pReturnedBytes[i]);
        }
    }
    // Verifies that two complete 3-byte UTF-8 sequences handed over in one
    // Parse() call are fully consumed and produce two wide chars.
    TEST_METHOD(ConvertSimpleUtf8Test)
    {
        Log::Comment(L"Testing that a simple UTF8 sequence can be converted");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        // U+3059, U+3057 (hiragana sushi)
        const unsigned char sushi[6] = { 0xe3, 0x81, 0x99, 0xe3, 0x81, 0x97 };
        // the expected wide chars, spelled out byte-by-byte (low byte first)
        const unsigned char wideSushi[4] = { 0x59, 0x30, 0x57, 0x30 };
        unsigned int count = 6;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        VERIFY_SUCCEEDED(parser.Parse(sushi, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)6);
        VERIFY_ARE_EQUAL(generated, (unsigned int)2);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        // compare the raw bytes of the returned wide chars against the expected bytes
        auto pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // size_t index: ARRAYSIZE yields an unsigned count, so don't compare it against a signed int
        for (size_t i = 0; i < ARRAYSIZE(wideSushi); ++i)
        {
            VERIFY_ARE_EQUAL(wideSushi[i], pReturnedBytes[i]);
        }
    }
    // Feeds the three bytes of one code point to the parser one byte per call.
    // The first two calls must consume their byte without producing output;
    // the third call completes the sequence and yields a single wide char.
    TEST_METHOD(WaitsForAdditionalInputAfterPartialSequenceTest)
    {
        Log::Comment(L"Testing that nothing is returned when parsing a partial sequence until the sequence is complete");
        // U+3057 (hiragana shi)
        unsigned char shi[3] = { 0xe3, 0x81, 0x97 };
        // the expected wide char, spelled out byte-by-byte (low byte first)
        unsigned char wideShi[2] = { 0x57, 0x30 };
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        // every call hands over exactly one byte; Parse takes the count by
        // value, so it never needs to be reset between calls
        const unsigned int count = 1;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        // lead byte and first continuation byte: consumed, but nothing generated yet
        for (auto i = 0; i < 2; ++i)
        {
            VERIFY_SUCCEEDED(parser.Parse(shi + i, count, consumed, output, generated));
            VERIFY_ARE_EQUAL(consumed, (unsigned int)1);
            VERIFY_ARE_EQUAL(generated, (unsigned int)0);
            VERIFY_ARE_EQUAL(output.get(), nullptr);
        }
        // final continuation byte: the completed sequence is converted
        VERIFY_SUCCEEDED(parser.Parse(shi + 2, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)1);
        VERIFY_ARE_EQUAL(generated, (unsigned int)1);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        auto pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // size_t index: ARRAYSIZE yields an unsigned count, so don't compare it against a signed int
        for (size_t i = 0; i < ARRAYSIZE(wideShi); ++i)
        {
            VERIFY_ARE_EQUAL(wideShi[i], pReturnedBytes[i]);
        }
    }
    // Verifies that when a buffer ends in the middle of a multi-byte sequence,
    // the complete code point in front of it is still returned, and that the
    // trailing code point is returned once its remaining bytes arrive in
    // later calls.
    TEST_METHOD(ReturnsInitialPartOfSequenceThatEndsWithPartialTest)
    {
        Log::Comment(L"Testing that a valid portion of a sequence is returned when it ends with a partial sequence");
        // U+3059, U+3057 (hiragana sushi)
        const unsigned char sushi[6] = { 0xe3, 0x81, 0x99, 0xe3, 0x81, 0x97 };
        // the expected wide chars, spelled out byte-by-byte (low byte first)
        const unsigned char wideSushi[4] = { 0x59, 0x30, 0x57, 0x30 };
        // first call: the whole first code point plus the lead byte of the second
        unsigned int count = 4;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        VERIFY_SUCCEEDED(parser.Parse(sushi, count, consumed, output, generated));
        // check that we got the first wide char back
        VERIFY_ARE_EQUAL(consumed, (unsigned int)4);
        VERIFY_ARE_EQUAL(generated, (unsigned int)1);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        auto pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        for (auto i = 0; i < 2; ++i)
        {
            VERIFY_ARE_EQUAL(wideSushi[i], pReturnedBytes[i]);
        }
        // add byte 2 of 3 to parser
        count = 1;
        consumed = 0;
        generated = 0;
        output.reset(nullptr);
        VERIFY_SUCCEEDED(parser.Parse(sushi + 4, count, consumed, output, generated));
        // the byte is consumed but the sequence is still incomplete, so no output yet
        VERIFY_ARE_EQUAL(consumed, (unsigned int)1);
        VERIFY_ARE_EQUAL(generated, (unsigned int)0);
        VERIFY_ARE_EQUAL(output.get(), nullptr);
        // add last byte
        count = 1;
        consumed = 0;
        generated = 0;
        output.reset(nullptr);
        VERIFY_SUCCEEDED(parser.Parse(sushi + 5, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)1);
        VERIFY_ARE_EQUAL(generated, (unsigned int)1);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // the second wide char starts at byte offset 2 of the expected data
        for (auto i = 0; i < 2; ++i)
        {
            VERIFY_ARE_EQUAL(wideSushi[i + 2], pReturnedBytes[i]);
        }
    }
    // Sends eight 3-byte code points in three chunks (4, 16 and 4 bytes) whose
    // boundaries fall inside code points, and verifies that the bytes left
    // over from one call are combined with the bytes of the next call.
    TEST_METHOD(MergesMultiplePartialSequencesTest)
    {
        Log::Comment(L"Testing that partial sequences sent individually will be merged together");
        // clang-format off
        // (hiragana doomo arigatoo)
        const unsigned char doomoArigatoo[24] = {
            0xe3, 0x81, 0xa9, // U+3069
            0xe3, 0x81, 0x86, // U+3046
            0xe3, 0x82, 0x82, // U+3082
            0xe3, 0x81, 0x82, // U+3042
            0xe3, 0x82, 0x8a, // U+308A
            0xe3, 0x81, 0x8c, // U+304C
            0xe3, 0x81, 0xa8, // U+3068
            0xe3, 0x81, 0x86  // U+3046
        };
        // the expected wide chars, spelled out byte-by-byte (low byte first)
        const unsigned char wideDoomoArigatoo[16] = {
            0x69, 0x30,
            0x46, 0x30,
            0x82, 0x30,
            0x42, 0x30,
            0x8a, 0x30,
            0x4c, 0x30,
            0x68, 0x30,
            0x46, 0x30
        };
        // clang-format on
        // send first 4 bytes
        // (one whole code point plus the lead byte of the next one)
        unsigned int count = 4;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        VERIFY_SUCCEEDED(parser.Parse(doomoArigatoo, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)4);
        VERIFY_ARE_EQUAL(generated, (unsigned int)1);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        auto pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        for (auto i = 0; i < 2; ++i)
        {
            VERIFY_ARE_EQUAL(wideDoomoArigatoo[i], pReturnedBytes[i]);
        }
        // send next 16 bytes
        // (together with the 1 leftover byte that is 17 bytes: five whole code
        // points are expected, with 2 bytes left over for the next call)
        count = 16;
        consumed = 0;
        generated = 0;
        output.reset(nullptr);
        VERIFY_SUCCEEDED(parser.Parse(doomoArigatoo + 4, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)16);
        VERIFY_ARE_EQUAL(generated, (unsigned int)5);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // wide chars 2 through 6 start at byte offset 2 of the expected data
        for (auto i = 0; i < 10; ++i)
        {
            VERIFY_ARE_EQUAL(wideDoomoArigatoo[i + 2], pReturnedBytes[i]);
        }
        // send last 4 bytes
        // (together with the 2 leftover bytes that is 6 bytes: the final two code points)
        count = 4;
        consumed = 0;
        generated = 0;
        output.reset(nullptr);
        VERIFY_SUCCEEDED(parser.Parse(doomoArigatoo + 20, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)4);
        VERIFY_ARE_EQUAL(generated, (unsigned int)2);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // the last two wide chars start at byte offset 12 of the expected data
        for (auto i = 0; i < 4; ++i)
        {
            VERIFY_ARE_EQUAL(wideDoomoArigatoo[i + 12], pReturnedBytes[i]);
        }
    }
    // Verifies that stray continuation bytes between two valid code points
    // are dropped: all nine input bytes are consumed, but only the two valid
    // code points show up in the output.
    TEST_METHOD(RemovesInvalidSequencesTest)
    {
        Log::Comment(L"Testing that invalid sequences are removed and don't stop the parsing of the rest");
        // clang-format off
        // hiragana sushi with junk between japanese characters
        const unsigned char sushi[9] = {
            0xe3, 0x81, 0x99, // U+3059
            0x80, 0x81, 0x82, // junk continuation bytes
            0xe3, 0x81, 0x97  // U+3057
        };
        // clang-format on
        // the expected wide chars, spelled out byte-by-byte (low byte first)
        const unsigned char wideSushi[4] = { 0x59, 0x30, 0x57, 0x30 };
        unsigned int count = 9;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        VERIFY_SUCCEEDED(parser.Parse(sushi, count, consumed, output, generated));
        VERIFY_ARE_EQUAL(consumed, (unsigned int)9);
        VERIFY_ARE_EQUAL(generated, (unsigned int)2);
        VERIFY_ARE_NOT_EQUAL(output.get(), nullptr);
        auto pReturnedBytes = reinterpret_cast<unsigned char*>(output.get());
        // size_t index: ARRAYSIZE yields an unsigned count, so don't compare it against a signed int
        for (size_t i = 0; i < ARRAYSIZE(wideSushi); ++i)
        {
            VERIFY_ARE_EQUAL(wideSushi[i], pReturnedBytes[i]);
        }
    }
    // Feeds a buffer that mixes single-byte characters with non-minimal
    // (overlong) 2-, 3- and 4-byte encodings of U+0000. The expected output
    // has U+FFFD replacement characters where the overlong forms were, while
    // the single-byte characters around them convert normally.
    TEST_METHOD(NonMinimalFormTest)
    {
        Log::Comment(L"Testing that non-minimal forms of a character are tolerated don't stop the rest");
        // clang-format off
        // Test data
        const unsigned char data[] = {
            0x60, 0x12, 0x08, 0x7f, // single byte points
            0xc0, 0x80, // U+0000 as a 2-byte sequence (non-minimal)
            0x41, 0x48, 0x06, 0x55, // more single byte points
            0xe0, 0x80, 0x80, // U+0000 as a 3-byte sequence (non-minimal)
            0x18, 0x77, 0x40, 0x31, // more single byte points
            0xf0, 0x80, 0x80, 0x80, // U+0000 as a 4-byte sequence (non-minimal)
            0x59, 0x1f, 0x68, 0x20 // more single byte points
        };
        // Expected conversion
        const wchar_t wideData[] = {
            0x0060, 0x0012, 0x0008, 0x007f,
            0xfffd, 0xfffd, // The number of replacements per invalid sequence is not intended to be load-bearing
            0x0041, 0x0048, 0x0006, 0x0055,
            0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380
            0x0018, 0x0077, 0x0040, 0x0031,
            0xfffd, 0xfffd, 0xfffd, // Change if necessary when completing GH#3378
            0x0059, 0x001f, 0x0068, 0x0020
        };
        // clang-format on
        const auto count = gsl::narrow_cast<unsigned int>(ARRAYSIZE(data));
        const auto wideCount = gsl::narrow_cast<unsigned int>(ARRAYSIZE(wideData));
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        VERIFY_SUCCEEDED(parser.Parse(data, count, consumed, output, generated));
        // the whole input must be consumed and the output length must match the expectation
        VERIFY_ARE_EQUAL(count, consumed);
        VERIFY_ARE_EQUAL(wideCount, generated);
        VERIFY_IS_NOT_NULL(output.get());
        // compare expected and actual output as length-bounded strings
        const auto expected = WEX::Common::String(wideData, wideCount);
        const auto actual = WEX::Common::String(output.get(), generated);
        VERIFY_ARE_EQUAL(expected, actual);
    }
    // Verifies that SetCodePage keeps a stored partial sequence when it is
    // given the code page that is already active, and discards it (returning
    // the parser to the Ready state) when the code page actually changes.
    // The test inspects the parser's internal _currentState and _bytesStored.
    TEST_METHOD(PartialBytesAreDroppedOnCodePageChangeTest)
    {
        Log::Comment(L"Testing that a saved partial sequence is cleared when the codepage changes");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        // 2 bytes of a 4 byte sequence
        const unsigned int inputSize = 2;
        const unsigned char partialSequence[inputSize] = { 0xF0, 0x80 };
        auto count = inputSize;
        unsigned int consumed = 0;
        unsigned int generated = 0;
        unique_ptr<wchar_t[]> output{ nullptr };
        VERIFY_SUCCEEDED(parser.Parse(partialSequence, count, consumed, output, generated));
        // the parser must now be holding on to both bytes, waiting for the rest
        VERIFY_ARE_EQUAL(parser._currentState, Utf8ToWideCharParser::_State::BeginPartialParse);
        VERIFY_ARE_EQUAL(parser._bytesStored, inputSize);
        // set the codepage to the same one it currently is, ensure
        // that nothing changes
        parser.SetCodePage(utf8CodePage);
        VERIFY_ARE_EQUAL(parser._currentState, Utf8ToWideCharParser::_State::BeginPartialParse);
        VERIFY_ARE_EQUAL(parser._bytesStored, inputSize);
        // change to a different codepage, ensure parser is reset
        parser.SetCodePage(USACodePage);
        VERIFY_ARE_EQUAL(parser._currentState, Utf8ToWideCharParser::_State::Ready);
        VERIFY_ARE_EQUAL(parser._bytesStored, (unsigned int)0);
    }
    // Verifies _IsLeadByte: only bytes announcing a 2-, 3- or 4-byte sequence
    // count as lead bytes. ASCII bytes, continuation bytes and bytes that
    // announce more than four bytes are rejected.
    TEST_METHOD(_IsLeadByteTest)
    {
        Log::Comment(L"Testing that _IsLeadByte properly differentiates correct from incorrect sequences");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        VERIFY_IS_TRUE(parser._IsLeadByte(0xC0)); // 2 byte sequence
        VERIFY_IS_TRUE(parser._IsLeadByte(0xE0)); // 3 byte sequence
        VERIFY_IS_TRUE(parser._IsLeadByte(0xF0)); // 4 byte sequence
        VERIFY_IS_FALSE(parser._IsLeadByte(0x00)); // ASCII char NUL
        VERIFY_IS_FALSE(parser._IsLeadByte(0x80)); // continuation byte
        VERIFY_IS_FALSE(parser._IsLeadByte(0x83)); // continuation byte
        VERIFY_IS_FALSE(parser._IsLeadByte(0x7E)); // ASCII char '~'
        VERIFY_IS_FALSE(parser._IsLeadByte(0x21)); // ASCII char '!'
        VERIFY_IS_FALSE(parser._IsLeadByte(0xF8)); // invalid 5 byte sequence
        VERIFY_IS_FALSE(parser._IsLeadByte(0xFC)); // invalid 6 byte sequence
        VERIFY_IS_FALSE(parser._IsLeadByte(0xFE)); // invalid 7 byte sequence
        VERIFY_IS_FALSE(parser._IsLeadByte(0xFF)); // all 1's
    }
    // Verifies _IsContinuationByte for every byte value: exactly the bytes
    // of the form 10xx xxxx (bit 0x80 set, bit 0x40 clear) must be reported
    // as continuation bytes.
    TEST_METHOD(_IsContinuationByteTest)
    {
        Log::Comment(L"Testing that _IsContinuationByte properly differentiates correct from incorrect sequences");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        // the loop covers 0x00 through 0xFE; 0xFF is checked separately below
        // (presumably because a BYTE counter cannot count past 0xFF)
        for (BYTE i = 0x00; i < 0xFF; ++i)
        {
            if (IsBitSet(i, 0x80) && !IsBitSet(i, 0x40))
            {
                VERIFY_IS_TRUE(parser._IsContinuationByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
            }
            else
            {
                VERIFY_IS_FALSE(parser._IsContinuationByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
            }
        }
        VERIFY_IS_FALSE(parser._IsContinuationByte(0xFF));
    }
    // Verifies _IsAsciiByte for every byte value: 0x00 through 0x7F must be
    // reported as ASCII, 0x80 through 0xFF must not.
    TEST_METHOD(_IsAsciiByteTest)
    {
        Log::Comment(L"Testing that _IsAsciiByte properly differentiates correct from incorrect sequences");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        // lower half of the byte range: most significant bit clear
        for (BYTE i = 0x00; i < 0x80; ++i)
        {
            VERIFY_IS_TRUE(parser._IsAsciiByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
        }
        // upper half of the byte range, walked downwards from 0xFF to 0x80
        for (BYTE i = 0xFF; i > 0x7F; --i)
        {
            VERIFY_IS_FALSE(parser._IsAsciiByte(i), NoThrowString().Format(L"Byte is 0x%02x", i));
        }
    }
    // Verifies that _Utf8SequenceSize reports the number of consecutive 1 bits
    // at the top of a byte, without validating it: an ASCII byte yields 0, a
    // continuation byte yields 1, and malformed lead bytes yield up to 8.
    TEST_METHOD(_Utf8SequenceSizeTest)
    {
        Log::Comment(L"Testing that _Utf8SequenceSize correctly counts the number of MSB 1's");
        auto parser = Utf8ToWideCharParser{ utf8CodePage };
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0x00), (unsigned int)0);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0x80), (unsigned int)1);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xC2), (unsigned int)2);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xE3), (unsigned int)3);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xF0), (unsigned int)4);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xF3), (unsigned int)4);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xF8), (unsigned int)5);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFC), (unsigned int)6);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFD), (unsigned int)6);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFE), (unsigned int)7);
        VERIFY_ARE_EQUAL(parser._Utf8SequenceSize(0xFF), (unsigned int)8);
    }
 };
--- a/src/host/ut_host/sources
+++ b/src/host/ut_host/sources
@@ -27,7 +27,6 @@ SOURCES = \
    TextBufferTests.cpp \
    ClipboardTests.cpp \
    SelectionTests.cpp \
    Utf8ToWideCharParserTests.cpp \
    OutputCellIteratorTests.cpp \
    InitTests.cpp \
    TitleTests.cpp \
--- a/src/host/utf8ToWideCharParser.cpp
+++ b/src/host/utf8ToWideCharParser.cpp
@@ -1,520 +0,0 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 #include "precomp.h"
 #include "utf8ToWideCharParser.hpp"
 #include <unicode.hpp>
 #ifndef WIL_ENABLE_EXCEPTIONS
 #error WIL exception helpers must be enabled
 #endif
 #define IsBitSet WI_IsFlagSet
 // Bit that is clear in every ASCII-compatible byte and set in all others.
 const byte NonAsciiBytePrefix = 0x80;
 // Selects the two most significant bits of a byte.
 const byte ContinuationByteMask = 0xC0;
 // Value of the two most significant bits (10xx xxxx) of a continuation byte.
 const byte ContinuationBytePrefix = 0x80;
 // The most significant bit of a byte; used to count a byte's leading 1 bits.
 const byte MostSignificantBitMask = 0x80;
 // Routine Description:
 // - Creates a parser that starts out in the Ready state with no stored
 // bytes and no converted output.
 // Arguments:
 // - codePage - The code page that input is initially interpreted with.
 // Return Value:
 // - A new instance of the parser.
 Utf8ToWideCharParser::Utf8ToWideCharParser(const unsigned int codePage) :
    _currentCodePage{ codePage },
    _bytesStored{ 0 },
    _currentState{ _State::Ready },
    _convertedWideChars{ nullptr }
 {
    // zero out the buffer that holds the bytes of a pending partial sequence
    std::fill(_utf8CodePointPieces, _utf8CodePointPieces + _UTF8_BYTE_SEQUENCE_MAX, 0ui8);
 }
 // Routine Description:
 // - Selects the code page that subsequent input is interpreted with.
 // Switching to a different code page throws away any saved partial
 // multi-byte sequence and returns the parser to the Ready state.
 // Setting the code page that is already active changes nothing.
 // Arguments:
 // - codePage - the code page to set to.
 // Return Value:
 // - <none>
 void Utf8ToWideCharParser::SetCodePage(const unsigned int codePage)
 {
    if (codePage == _currentCodePage)
    {
        // same code page: whatever partial sequence is stored stays meaningful
        return;
    }
    // bytes saved under the previous code page can't be assumed to mean
    // anything under the new one, so forget them
    _bytesStored = 0;
    _currentState = _State::Ready;
    _currentCodePage = codePage;
 }
 // Routine Description:
 // - Parses the input multi-byte sequence.
 // Arguments:
 // - pBytes - The byte sequence to parse.
 // - cchBuffer - The number of bytes in pBytes. If this is 0 the function
 // succeeds immediately and leaves converted untouched.
 // - cchConsumed - Receives the number of bytes of pBytes that were
 // consumed. This is cchBuffer when the parse finishes or when the parser
 // is waiting for more bytes, and 0 otherwise.
 // - converted - a valid unique_ptr to store the parsed wide chars
 // in. On error this will contain nullptr instead of an array.
 // - cchConverted - Receives the number of wide chars stored in
 // converted, or 0 if an error occurs.
 // Return Value:
 // - S_OK on success, E_FAIL if the current code page is not UTF-8 or the
 // parser ends up in its error state, or the HRESULT of a caught exception.
 [[nodiscard]] HRESULT Utf8ToWideCharParser::Parse(_In_reads_(cchBuffer) const byte* const pBytes,
                                                  _In_ const unsigned int cchBuffer,
                                                  _Out_ unsigned int& cchConsumed,
                                                  _Inout_ std::unique_ptr<wchar_t[]>& converted,
                                                  _Out_ unsigned int& cchConverted)
 {
    cchConsumed = 0;
    cchConverted = 0;
    // we can't parse anything if we weren't given any data to parse
    if (cchBuffer == 0)
    {
        return S_OK;
    }
    // we shouldn't be parsing if the current codepage isn't UTF8
    if (_currentCodePage != CP_UTF8)
    {
        _currentState = _State::Error;
    }
    auto hr = S_OK;
    try
    {
        auto loop = true;
        unsigned int wideCharCount = 0;
        _convertedWideChars.reset(nullptr);
        // Run the state machine until it reaches a state that ends this
        // call: Finished, AwaitingMoreBytes or Error.
        while (loop)
        {
            switch (_currentState)
            {
            case _State::Ready:
                // try to convert the input on its own first
                wideCharCount = _ParseFullRange(pBytes, cchBuffer);
                break;
            case _State::BeginPartialParse:
                // take previously stored bytes into account
                wideCharCount = _InvolvedParse(pBytes, cchBuffer);
                break;
            case _State::Error:
                hr = E_FAIL;
                _Reset();
                wideCharCount = 0;
                loop = false;
                break;
            case _State::Finished:
                _currentState = _State::Ready;
                cchConsumed = cchBuffer;
                loop = false;
                break;
            case _State::AwaitingMoreBytes:
                // the next call has to continue from the stored bytes
                _currentState = _State::BeginPartialParse;
                cchConsumed = cchBuffer;
                loop = false;
                break;
            default:
                // unknown state: report it through the Error case on the next iteration
                _currentState = _State::Error;
                break;
            }
        }
        converted.swap(_convertedWideChars);
        cchConverted = wideCharCount;
    }
    catch (...)
    {
        _Reset();
        hr = wil::ResultFromCaughtException();
    }
    // Honor the documented contract on every failure path: neither the
    // error state nor a caught exception may leave the caller holding a
    // stale or partially built buffer.
    if (FAILED(hr))
    {
        converted.reset();
    }
    return hr;
 }
 // Routine Description:
 // - Tests whether ch can start a UTF8 multi-byte sequence. See
 // _Utf8SequenceSize() for how a lead byte encodes the sequence length.
 // Arguments:
 // - ch - The byte to test.
 // Return Value:
 // - True if ch is a lead byte, false otherwise.
 bool Utf8ToWideCharParser::_IsLeadByte(_In_ byte ch)
 {
    // continuation bytes and ASCII bytes never start a multi-byte sequence
    if (_IsContinuationByte(ch) || _IsAsciiByte(ch))
    {
        return false;
    }
    // a lead byte has to announce at least 2 and at most
    // _UTF8_BYTE_SEQUENCE_MAX bytes
    const auto announcedSize = _Utf8SequenceSize(ch);
    return announcedSize > 1 && announcedSize <= _UTF8_BYTE_SEQUENCE_MAX;
 }
 // Routine Description:
 // - Tests whether ch is a UTF8 continuation byte, i.e. a byte of the
 // form 10xx xxxx: the most significant bit is a 1 and the bit after
 // it is a 0.
 // Arguments:
 // - ch - The byte to test
 // Return Value:
 // - True if ch is a continuation byte, false otherwise.
 bool Utf8ToWideCharParser::_IsContinuationByte(_In_ byte ch)
 {
    // isolate the two most significant bits and compare them against 10
    const auto topTwoBits = ch & ContinuationByteMask;
    return topTwoBits == ContinuationBytePrefix;
 }
 // Routine Description:
 // - Tests whether ch is an ASCII compatible UTF8 byte, which is the
 // case when its most significant bit is a 0.
 // Arguments:
 // - ch - The byte to test.
 // Return Value:
 // - True if ch is an ASCII compatible byte, false otherwise.
 bool Utf8ToWideCharParser::_IsAsciiByte(_In_ byte ch)
 {
    return (ch & NonAsciiBytePrefix) == 0;
 }
 // Routine Description:
 // - Determines if the sequence starting at pLeadByte is a complete UTF8
 // multi-byte sequence: a lead byte followed by as many continuation
 // bytes as the lead byte announces. Note that a single ASCII byte does
 // not count as a valid MULTI-byte sequence.
 // Arguments:
 // - pLeadByte - The start of a possible sequence.
 // - cb - The amount of remaining chars in the array that
 // pLeadByte points to.
 // Return Value:
 // - true if a whole multi-byte sequence starts at pLeadByte and fits
 // within the cb remaining chars, false otherwise (including when cb
 // is 0).
 bool Utf8ToWideCharParser::_IsValidMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
 {
    // with no readable bytes there is nothing to inspect, and reading
    // *pLeadByte would be out of bounds
    if (cb == 0)
    {
        return false;
    }
    if (!_IsLeadByte(*pLeadByte))
    {
        return false;
    }
    const auto sequenceSize = _Utf8SequenceSize(*pLeadByte);
    // the announced sequence must not run past the end of the buffer
    if (sequenceSize > cb)
    {
        return false;
    }
    // i starts at 1 so that we skip the lead byte
    for (unsigned int i = 1; i < sequenceSize; ++i)
    {
        const auto ch = *(pLeadByte + i);
        if (!_IsContinuationByte(ch))
        {
            return false;
        }
    }
    return true;
 }
 // Routine Description:
 // - Checks if the sequence starting at pLeadByte is a portion of a
 // single valid multi-byte sequence. A new sequence must not be
 // started within the range provided in order for it to be considered
 // a valid partial sequence.
 // Arguments:
 // - pLeadByte - The start of the possible partial sequence.
 // - cb - The amount of remaining chars in the array that
 // pLeadByte points to.
 // Return Value:
 // - true if the cb chars are a lead byte followed only by continuation
 // bytes, and the lead byte announces more bytes than are available.
 // false otherwise (including when cb is 0).
 bool Utf8ToWideCharParser::_IsPartialMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
 {
    // with no readable bytes there is nothing to inspect, and reading
    // *pLeadByte would be out of bounds
    if (cb == 0)
    {
        return false;
    }
    if (!_IsLeadByte(*pLeadByte))
    {
        return false;
    }
    const auto sequenceSize = _Utf8SequenceSize(*pLeadByte);
    // if the whole announced sequence fits in the buffer it is not partial
    if (sequenceSize <= cb)
    {
        return false;
    }
    // i starts at 1 so that we skip the lead byte
    for (unsigned int i = 1; i < cb; ++i)
    {
        const auto ch = *(pLeadByte + i);
        if (!_IsContinuationByte(ch))
        {
            return false;
        }
    }
    return true;
 }
 // Routine Description:
 // - Reports how many bytes a UTF8 lead byte announces for its
 // sequence. The length is encoded as a run of 1 bits starting at the
 // most significant bit, terminated by a 0 bit. Ex:
 // - 110x xxxx = a two byte sequence
 // - 1110 xxxx = a three byte sequence
 //
 // No validation is performed; this function only counts the
 // consecutive 1 bits at the top of ch. Consequently:
 // - an ASCII byte (0xxx xxxx) is reported as 0,
 // - a continuation byte (10xx xxxx) is reported as 1,
 // - a malformed byte (ex. 1111 110x) may be reported as more than
 // the current maximum sequence length of four bytes.
 // It is the responsibility of the calling function to check for
 // these cases.
 // Arguments:
 // - ch - the lead byte of a UTF8 multi-byte sequence.
 // Return Value:
 // - The number of bytes (including the lead byte) that ch indicates
 // are in the sequence.
 unsigned int Utf8ToWideCharParser::_Utf8SequenceSize(_In_ byte ch)
 {
    unsigned int leadingOnes = 0;
    // walk a single-bit mask down from the most significant bit and stop at
    // the first 0 bit (or once all eight bits have been visited)
    for (byte mask = MostSignificantBitMask; mask != 0 && (ch & mask) != 0; mask >>= 1)
    {
        ++leadingOnes;
    }
    return leadingOnes;
 }
 // Routine Description:
 // - Attempts to parse pInputChars by themselves in wide chars,
 // without using any saved partial byte sequences. On success,
 // _convertedWideChars will contain the converted wide char sequence
 // and _currentState will be set to _State::Finished. On failure,
 // _currentState will be set to either _State::Error or
 // _State::BeginPartialParse.
 // Arguments:
 // - pInputChars - The byte sequence to convert to wide chars.
 // - cb - The amount of bytes in pInputChars. Counts that do not fit
 // into the signed length parameter of MultiByteToWideChar are rejected
 // with _State::Error.
 // Return Value:
 // - The amount of wide chars that are stored in _convertedWideChars,
 // or 0 if pInputChars cannot be successfully converted.
 unsigned int Utf8ToWideCharParser::_ParseFullRange(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
 {
    // MultiByteToWideChar takes the input length as a signed int. A count
    // above INT_MAX would wrap to a negative value, which the API does not
    // interpret as a byte count, so reject it up front.
    constexpr unsigned int maxInputBytes = ~0u >> 1; // largest value a same-width int can hold
    if (cb > maxInputBytes)
    {
        _currentState = _State::Error;
        return 0;
    }
    const auto cbInput = static_cast<int>(cb);
    const auto pInput = reinterpret_cast<LPCCH>(pInputChars);

    // First pass: no output buffer, so this only validates the input
    // (MB_ERR_INVALID_CHARS) and reports the required buffer size.
    const auto cchRequired = MultiByteToWideChar(_currentCodePage,
                                                 MB_ERR_INVALID_CHARS,
                                                 pInput,
                                                 cbInput,
                                                 nullptr,
                                                 0);
    if (cchRequired == 0)
    {
        const auto err = GetLastError();
        LOG_WIN32(err);
        // input that doesn't translate cleanly gets handed to the partial
        // parse; every other failure is an error
        _currentState = (err == ERROR_NO_UNICODE_TRANSLATION) ? _State::BeginPartialParse : _State::Error;
        return 0;
    }

    // Second pass: perform the actual conversion into a right-sized buffer.
    _convertedWideChars = std::make_unique<wchar_t[]>(cchRequired);
    const auto cchWritten = MultiByteToWideChar(_currentCodePage,
                                                0,
                                                pInput,
                                                cbInput,
                                                _convertedWideChars.get(),
                                                cchRequired);
    if (cchWritten == 0)
    {
        LOG_LAST_ERROR();
        _currentState = _State::Error;
        return 0;
    }
    _currentState = _State::Finished;
    return static_cast<unsigned int>(cchWritten);
 }
 // Routine Description:
 // - Slow path: prepends any partial byte sequence saved by a previous
 // call to pInputChars, strips invalid sequences from the combined
 // bytes via _RemoveInvalidSequences (which may in turn save a new
 // partial sequence from the end), and converts what is left.
 // - On exit _currentState is one of:
 //   - _State::Finished - converted, nothing left pending.
 //   - _State::AwaitingMoreBytes - a partial sequence is saved and more
 //     input is required to complete it.
 //   - _State::Error - the length computation or the conversion failed.
 // Arguments:
 // - pInputChars - The bytes to convert to wide chars.
 // - cb - The number of bytes in pInputChars.
 // Return Value:
 // - The number of wide chars stored in _convertedWideChars, or 0 if
 // nothing could be converted (on error, or because only a partial
 // sequence is available so far).
 unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
 {
    // Overflow-checked sum of the saved bytes and the new input.
    unsigned int cbMerged = 0;
    const auto addResult = UIntAdd(cb, _bytesStored, &cbMerged);
    if (FAILED(addResult))
    {
        LOG_HR(addResult);
        _currentState = _State::Error;
        return 0;
    }

    // Lay out the saved bytes first, immediately followed by the new input.
    auto merged = std::make_unique<byte[]>(cbMerged);
    const auto pAfterSaved = std::copy(_utf8CodePointPieces, _utf8CodePointPieces + _bytesStored, merged.get());
    std::copy(pInputChars, pInputChars + cb, pAfterSaved);
    _bytesStored = 0;

    // This may store a new partial sequence, i.e. make _bytesStored non-zero again.
    const auto filtered = _RemoveInvalidSequences(merged.get(), cbMerged);
    const auto pClean = reinterpret_cast<LPCCH>(filtered.first.get());
    const auto cbClean = filtered.second;
    const auto hasPendingBytes = _bytesStored > 0;

    // The whole input may have been just a partial sequence, in which
    // case there is nothing to convert yet.
    if (cbClean == 0 && hasPendingBytes)
    {
        _currentState = _State::AwaitingMoreBytes;
        return 0;
    }

    // Obviously invalid sequences are gone by now, but non-minimal forms
    // may remain. MB2WC rejects those when given MB_ERR_INVALID_CHARS, so
    // flags = 0 is used to have them replaced with U+FFFD instead.
    // Future cleanup and reconciliation is tracked by GH#3378; the issue
    // that originally introduced this was GH#3320.
    const auto cchRequired = MultiByteToWideChar(_currentCodePage, 0, pClean, cbClean, nullptr, 0);
    if (cchRequired == 0)
    {
        LOG_LAST_ERROR();
        _currentState = _State::Error;
        return 0;
    }

    _convertedWideChars = std::make_unique<wchar_t[]>(cchRequired);
    const auto cchWritten = MultiByteToWideChar(_currentCodePage, 0, pClean, cbClean, _convertedWideChars.get(), cchRequired);
    if (cchWritten == 0)
    {
        LOG_LAST_ERROR();
        _currentState = _State::Error;
        return 0;
    }

    _currentState = hasPendingBytes ? _State::AwaitingMoreBytes : _State::Finished;
    return cchWritten;
 }
 // Routine Description:
 // - Walks pInputChars once and builds a copy that contains only ASCII
 // bytes and multi-byte sequences accepted by _IsValidMultiByteSequence.
 // - Dropped along the way: runs of continuation bytes with no lead
 // byte, lead bytes whose sequence is neither valid nor partial (plus
 // the continuation bytes following them), and bytes that are neither
 // ASCII, continuation nor lead bytes.
 // - If a partial sequence is found it is handed to
 // _StorePartialSequence and the walk stops there.
 // Arguments:
 // - pInputChars - The byte sequence to clean up.
 // - cb - The number of bytes in pInputChars.
 // Return Value:
 // - A std::pair of the cleaned-up byte buffer and the number of bytes
 // that were written to it.
 std::pair<std::unique_ptr<byte[]>, unsigned int> Utf8ToWideCharParser::_RemoveInvalidSequences(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
 {
    auto cleaned = std::make_unique<byte[]>(cb);
    unsigned int writePos = 0; // next free slot in cleaned
    unsigned int readPos = 0; // next unread byte in pInputChars

    // Moves readPos past a (possibly empty) run of continuation bytes.
    const auto skipContinuationRun = [&]() {
        while (readPos < cb && _IsContinuationByte(pInputChars[readPos]))
        {
            ++readPos;
        }
    };

    while (readPos < cb)
    {
        const auto current = pInputChars[readPos];

        if (_IsAsciiByte(current))
        {
            cleaned[writePos++] = current;
            ++readPos;
            continue;
        }

        if (_IsContinuationByte(current))
        {
            // continuation bytes without a lead byte, drop the whole run.
            skipContinuationRun();
            continue;
        }

        if (!_IsLeadByte(current))
        {
            // invalid byte, skip it.
            ++readPos;
            continue;
        }

        const auto pSequence = pInputChars + readPos;
        const auto remaining = cb - readPos;

        if (_IsValidMultiByteSequence(pSequence, remaining))
        {
            // clamped to the remaining input so the copy can never overrun it
            const auto cbSequence = std::min(_Utf8SequenceSize(current), remaining);
            std::copy(pSequence, pSequence + cbSequence, cleaned.get() + writePos);
            writePos += cbSequence;
            readPos += cbSequence;
            continue;
        }

        if (_IsPartialMultiByteSequence(pSequence, remaining))
        {
            _StorePartialSequence(pSequence, remaining);
            break;
        }

        // Broken sequence: drop the lead byte and the continuation bytes trailing it.
        ++readPos;
        skipContinuationRun();
    }

    return std::make_pair(std::move(cleaned), writePos);
 }
 // Routine Description:
 // - Saves the beginning of an incomplete byte sequence so that it can
 // be combined with later input. Any previously saved sequence is
 // overwritten. At most
 // Utf8ToWideCharParser::_UTF8_BYTE_SEQUENCE_MAX bytes are kept.
 // Arguments:
 // - pLeadByte - The first byte of the sequence to save.
 // - cb - The number of bytes available at pLeadByte.
 // Return Value:
 // - <none>
 void Utf8ToWideCharParser::_StorePartialSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
 {
    // Clamp first so the copy below always fits into _utf8CodePointPieces.
    _bytesStored = std::min(cb, _UTF8_BYTE_SEQUENCE_MAX);
    std::copy_n(pLeadByte, _bytesStored, _utf8CodePointPieces);
 }
 // Routine Description:
 // - Puts the parser back into the state of a freshly constructed
 // instance: the converted output is released, no partial sequence is
 // stored and the state is _State::Ready. _currentCodePage is left
 // untouched.
 // Arguments:
 // - <none>
 // Return Value:
 // - <none>
 void Utf8ToWideCharParser::_Reset()
 {
    _convertedWideChars.reset();
    _bytesStored = 0;
    _currentState = _State::Ready;
 }
--- a/src/host/utf8ToWideCharParser.hpp
+++ b/src/host/utf8ToWideCharParser.hpp
@@ -1,64 +0,0 @@
 /*++
 Copyright (c) Microsoft Corporation
 Licensed under the MIT license.
 Module Name:
 - utf8ToWideCharParser.hpp
 Abstract:
 - This transforms a multi-byte character sequence into wide chars
 - It will attempt to work around invalid byte sequences
 - Partial byte sequences are supported
 Author(s):
 - Austin Diviness (AustDi) 16-August-2016
 --*/
 #pragma once
 // Stateful converter from a stream of multi-byte characters to wide
 // chars. Invalid byte sequences are worked around, and a multi-byte
 // sequence that is split across two inputs is supported by saving the
 // partial sequence between calls.
 class Utf8ToWideCharParser final
 {
 public:
    // codePage - the code page used for the conversion.
    Utf8ToWideCharParser(const unsigned int codePage);

    void SetCodePage(const unsigned int codePage);

    // Arguments:
    // - pBytes - The bytes to parse.
    // - cchBuffer - The number of bytes in pBytes.
    // - cchConsumed - Receives the number of bytes consumed from pBytes.
    // - converted - Receives the converted wide chars.
    // - cchConverted - Receives the number of wide chars in converted.
    [[nodiscard]] HRESULT Parse(_In_reads_(cchBuffer) const byte* const pBytes,
                                _In_ const unsigned int cchBuffer,
                                _Out_ unsigned int& cchConsumed,
                                _Inout_ std::unique_ptr<wchar_t[]>& converted,
                                _Out_ unsigned int& cchConverted);

 private:
    enum class _State
    {
        Ready, // ready for input, no partially parsed code points
        Error, // error in parsing given bytes
        BeginPartialParse, // not a clean byte sequence, needs involved parsing
        AwaitingMoreBytes, // have a partial sequence saved, waiting for the rest of it
        Finished // ready to return a wide char sequence
    };

    // Classification of single bytes and of the sequence starting at a lead byte.
    bool _IsLeadByte(_In_ byte ch);
    bool _IsContinuationByte(_In_ byte ch);
    bool _IsAsciiByte(_In_ byte ch);
    bool _IsValidMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb);
    bool _IsPartialMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb);
    unsigned int _Utf8SequenceSize(_In_ byte ch);

    // Conversion paths: _ParseFullRange converts the input as-is,
    // _InvolvedParse additionally deals with saved partial sequences and
    // with invalid sequences. Both return the number of wide chars
    // stored in _convertedWideChars.
    unsigned int _ParseFullRange(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb);
    unsigned int _InvolvedParse(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb);
    std::pair<std::unique_ptr<byte[]>, unsigned int> _RemoveInvalidSequences(_In_reads_(cb) const byte* const pInputChars,
                                                                             const unsigned int cb);
    void _StorePartialSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb);
    void _Reset();

    // constexpr (and therefore implicitly inline) so that binding this
    // constant to a reference, e.g. as a std::min argument, does not
    // require a separate out-of-class definition.
    static constexpr unsigned int _UTF8_BYTE_SEQUENCE_MAX = 4;

    byte _utf8CodePointPieces[_UTF8_BYTE_SEQUENCE_MAX]{}; // saved partial sequence
    unsigned int _bytesStored{ 0 }; // bytes stored in utf8CodePointPieces
    unsigned int _currentCodePage; // NOTE(review): presumably set from the constructor's codePage - confirm
    std::unique_ptr<wchar_t[]> _convertedWideChars;
    _State _currentState{ _State::Ready };

 #ifdef UNIT_TESTING
    friend class Utf8ToWideCharParserTests;
 #endif
 };