microsoft · miniksa · Jul 16, 2019 · Jun 30, 2019 · Jul 6, 2019 · Jul 6, 2019
diff --git a/src/cascadia/TerminalConnection/ConhostConnection.cpp b/src/cascadia/TerminalConnection/ConhostConnection.cpp
@@ -15,6 +15,7 @@
 
 #include <conpty-universal.h>
 #include "../../types/inc/Utils.hpp"
+#include "../../types/inc/UTF8OutPipeReader.hpp"
 
 using namespace ::Microsoft::Console;
 
@@ -189,39 +190,36 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
 
     DWORD ConhostConnection::_OutputThread()
     {
-        const size_t bufferSize = 4096;
-        BYTE buffer[bufferSize];
-        DWORD dwRead;
+        // Deliberately not static: a function-local static is constructed only once and would stay bound to the _outPipe of the first connection.
+        UTF8OutPipeReader pipeReader{ _outPipe };
+        std::string_view strView{};
+        // process the data of the output pipe in a loop
         while (true)
         {
-            dwRead = 0;
-            bool fSuccess = false;
-
-            fSuccess = !!ReadFile(_outPipe.get(), buffer, bufferSize, &dwRead, nullptr);
-            if (!fSuccess)
+            const HRESULT result = pipeReader.Read(strView);
+            if (FAILED(result))
             {
                 if (_closing.load())
                 {
                     // This is okay, break out to kill the thread
                     return 0;
                 }
-                else
-                {
-                    _disconnectHandlers();
-                    return (DWORD)-1;
-                }
+
+                _disconnectHandlers();
+                return static_cast<DWORD>(-1);
             }
-            if (dwRead == 0)
+            else if (strView.empty()) // the reader had nothing to pass back: leave the thread
             {
-                continue;
+                return 0;
             }
+
             // Convert buffer to hstring
-            char* pchStr = (char*)(buffer);
-            std::string str{ pchStr, dwRead };
-            auto hstr = winrt::to_hstring(str);
+            auto hstr{ winrt::to_hstring(strView) };
 
             // Pass the output to our registered event handlers
             _outputHandlers(hstr);
         }
+
+        return 0;
     }
 }
diff --git a/src/types/UTF8OutPipeReader.cpp b/src/types/UTF8OutPipeReader.cpp
@@ -0,0 +1,94 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "precomp.h"
+#include "inc/UTF8OutPipeReader.hpp"
+#include <algorithm>
+#include <type_traits>
+#include <utility>
+
+// Routine Description:
+// - Creates a reader for the given pipe.
+// Arguments:
+// - outPipe - handle of the pipe to read from. Only a reference to it is stored, so it has to
+//   stay alive for as long as the reader is used.
+UTF8OutPipeReader::UTF8OutPipeReader(wil::unique_hfile& outPipe) :
+    _outPipe{ outPipe }
+{
+}
+
+// Routine Description:
+// - Reads the next chunk from the pipe and passes back a view of it that ends with a complete UTF-8 code point.
+// - Code units of a partial code point at the end of the chunk are cached and prepended to the next chunk read.
+// - If a chunk consists of nothing but a partial code point, reading goes on as long as new bytes are delivered.
+// Arguments:
+// - strView - receives a view into the internal buffer; the next call of Read overwrites that buffer.
+// Return Value:
+// - S_OK if strView was updated; it is empty if the read delivered no bytes, even if that was caused by a failure.
+// - A failure HRESULT made from the last error code if reading failed while cached code units were pending.
+[[nodiscard]] HRESULT UTF8OutPipeReader::Read(_Out_ std::string_view& strView)
+{
+    // in case of early escaping
+    *_buffer = 0;
+    strView = reinterpret_cast<char*>(_buffer);
+
+    DWORD dwRead{}; // number of bytes in the buffer that are passed back
+    DWORD dwNewlyRead{}; // number of bytes that the latest call of ReadFile delivered
+    do
+    {
+        // copy UTF-8 code units that were remaining from the previously read chunk (if any)
+        if (_dwPartialsLen != 0)
+        {
+            std::move(_utf8Partials, _utf8Partials + _dwPartialsLen, _buffer);
+        }
+
+        // try to read data behind the cached code units
+        dwNewlyRead = 0;
+        const bool fSuccess{ !!ReadFile(_outPipe.get(), &_buffer[_dwPartialsLen], std::extent<decltype(_buffer)>::value - _dwPartialsLen, &dwNewlyRead, nullptr) };
+
+        dwRead = dwNewlyRead + _dwPartialsLen;
+        _dwPartialsLen = 0;
+
+        if (dwRead == 0) // quit if no data has been read and no cached data was left over
+        {
+            return S_OK;
+        }
+        else if (!fSuccess) // reading failed
+        {
+            return HRESULT_FROM_WIN32(GetLastError());
+        }
+
+        const BYTE* const endPtr{ _buffer + dwRead };
+        // If the last byte in the buffer was a byte belonging to a UTF-8 multi-byte character
+        if ((*(endPtr - 1) & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
+        {
+            // Check only up to 3 last bytes (all bytes of a shorter chunk), if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the buffer
+            const DWORD dwMaxSequenceLen{ dwRead < 3UL ? dwRead : 3UL };
+            for (DWORD dwSequenceLen{ 1UL }; dwSequenceLen <= dwMaxSequenceLen; ++dwSequenceLen)
+            {
+                const BYTE* const backIter{ endPtr - dwSequenceLen };
+                // If Lead Byte found
+                if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
+                {
+                    // If the Lead Byte indicates that the last bytes in the buffer are a partial UTF-8 code point then cache them:
+                    //  Use the bitmask at index `dwSequenceLen`. Compare the result with the operand having the same index. If they
+                    //  are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
+                    //  sequence is a complete UTF-8 code point and the whole buffer is ready for the conversion to hstring.
+                    if ((*backIter & _cmpMasks[dwSequenceLen]) != _cmpOperands[dwSequenceLen])
+                    {
+                        std::move(backIter, endPtr, _utf8Partials);
+                        dwRead -= dwSequenceLen;
+                        _dwPartialsLen = dwSequenceLen;
+                    }
+
+                    break;
+                }
+            }
+        }
+        // An empty view makes ConhostConnection::_OutputThread leave its loop, so read on if bytes arrived but all of them had to be cached
+    } while (dwRead == 0 && dwNewlyRead != 0);
+
+    // give back a view of the part of the buffer that contains complete code points only
+    strView = std::string_view{ reinterpret_cast<char*>(_buffer), dwRead };
+    return S_OK;
+}
diff --git a/src/types/inc/UTF8OutPipeReader.hpp b/src/types/inc/UTF8OutPipeReader.hpp
@@ -0,0 +1,69 @@
+/*++
+Copyright (c) Microsoft Corporation
+Licensed under the MIT license.
+
+Module Name:
+- UTF8OutPipeReader.hpp
+
+Abstract:
+- This reads a UTF-8 stream and gives back a buffer that contains complete code points only
+- Partial UTF-8 code points at the end of the buffer read are cached and prepended to the next chunk read
+
+Author(s):
+- Steffen Illhardt (german-one) 12-July-2019
+--*/
+
+#pragma once
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+#include <windows.h>
+#include <wil\common.h>
+#include <wil\resource.h>
+#include <string_view>
+
+class UTF8OutPipeReader final
+{
+public:
+    explicit UTF8OutPipeReader(wil::unique_hfile& outPipe); // explicit: a pipe handle shall not convert into a reader implicitly
+    [[nodiscard]] HRESULT Read(_Out_ std::string_view& strView); // the view passed back refers to _buffer
+
+private:
+    wil::unique_hfile& _outPipe; // the pipe to read from; only referenced, so it has to outlive the reader
+
+    enum _Utf8BitMasks : BYTE
+    {
+        IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
+        MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte matches the IsAsciiByte pattern
+        IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
+        MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte matches the IsContinuationByte pattern
+        IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
+        MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteTwoByteSequence pattern
+        IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
+        MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteThreeByteSequence pattern
+        IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
+        MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteFourByteSequence pattern
+    };
+
+    // array of bitmasks applied to a lead byte; the index is the number of bytes from the lead byte to the end of the chunk read
+    static constexpr BYTE _cmpMasks[]{
+        0, // unused
+        _Utf8BitMasks::MaskContinuationByte,
+        _Utf8BitMasks::MaskLeadByteTwoByteSequence,
+        _Utf8BitMasks::MaskLeadByteThreeByteSequence,
+    };
+
+    // array of values for the comparisons with the masked lead byte; a mismatch means that the code point at the end of the chunk is partial
+    static constexpr BYTE _cmpOperands[]{
+        0, // unused
+        _Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte: a lead byte at the very end never matches and is always cached
+        _Utf8BitMasks::IsLeadByteTwoByteSequence,
+        _Utf8BitMasks::IsLeadByteThreeByteSequence,
+    };
+
+    BYTE _buffer[4096]{ 0 }; // buffer for the chunk read
+    BYTE _utf8Partials[4]{ 0 }; // buffer for code units of a partial UTF-8 code point that have to be cached
+    DWORD _dwPartialsLen{}; // number of cached UTF-8 code units
+};
diff --git a/src/types/lib/types.vcxproj b/src/types/lib/types.vcxproj
@@ -12,6 +12,7 @@
     <ClCompile Include="..\MenuEvent.cpp" />
     <ClCompile Include="..\ModifierKeyState.cpp" />
     <ClCompile Include="..\Utf16Parser.cpp" />
+    <ClCompile Include="..\UTF8OutPipeReader.cpp" />
     <ClCompile Include="..\Viewport.cpp" />
     <ClCompile Include="..\WindowBufferSizeEvent.cpp" />
     <ClCompile Include="..\precomp.cpp">
@@ -24,6 +25,7 @@
     <ClInclude Include="..\inc\convert.hpp" />
     <ClInclude Include="..\inc\GlyphWidth.hpp" />
     <ClInclude Include="..\inc\IInputEvent.hpp" />
+    <ClInclude Include="..\inc\UTF8OutPipeReader.hpp" />
     <ClInclude Include="..\inc\Viewport.hpp" />
     <ClInclude Include="..\inc\Utf16Parser.hpp" />
     <ClInclude Include="..\precomp.h" />

diff --git a/src/types/lib/types.vcxproj.filters b/src/types/lib/types.vcxproj.filters
@@ -57,6 +57,9 @@
     <ClCompile Include="..\utils.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\UTF8OutPipeReader.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\inc\IInputEvent.hpp">
@@ -83,6 +86,9 @@
     <ClInclude Include="..\utils.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <Natvis Include="$(SolutionDir)tools\ConsoleTypes.natvis" />

diff --git a/src/types/ut_types/Types.Unit.Tests.vcxproj b/src/types/ut_types/Types.Unit.Tests.vcxproj
@@ -2,6 +2,7 @@
 <Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <Import Project="$(SolutionDir)src\common.build.pre.props" />
   <ItemGroup>
+    <ClCompile Include="UTF8OutPipeReaderTests.cpp" />
     <ClCompile Include="UtilsTests.cpp" />
     <ClCompile Include="UuidTests.cpp" />
     <ClCompile Include="..\precomp.cpp">