Fix for UTF-8 partials in function ConhostConnection::_OutputThread. (#1850)

german-one · DHowett · commit 312d3c93de4d · 2019-07-16T20:45:44.000-07:00
* Fix for UTF-8 partials in functions `ConhostConnection::_OutputThread` and `ApiRoutines::WriteConsoleOutputCharacterAImpl`. The implementation needs to check whether or not the buffer ends with a partial character. If so, only convert the code points that are complete, and save the partial code units in a cache that gets prepended to the next chunk of text.
* `UTF8OutPipeReader` class added
* Unit test added
* Use specific macros and WIL classes
* Avoid a possible deadlock caused by an unclosed pipe handle

(cherry picked from commit fa5b9b0)
diff --git a/src/cascadia/TerminalConnection/ConhostConnection.cpp b/src/cascadia/TerminalConnection/ConhostConnection.cpp
@@ -15,6 +15,7 @@
 
 #include <conpty-universal.h>
 #include "../../types/inc/Utils.hpp"
+#include "../../types/inc/UTF8OutPipeReader.hpp"
 
 using namespace ::Microsoft::Console;
 
@@ -189,39 +190,36 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
 
     DWORD ConhostConnection::_OutputThread()
     {
-        const size_t bufferSize = 4096;
-        BYTE buffer[bufferSize];
-        DWORD dwRead;
+        // Must not be static: the reader is bound to the pipe of this connection and caches its partial UTF-8 code units.
+        UTF8OutPipeReader pipeReader{ _outPipe };
+        std::string_view strView{};
+        // process the data of the output pipe in a loop
         while (true)
         {
-            dwRead = 0;
-            bool fSuccess = false;
-
-            fSuccess = !!ReadFile(_outPipe.get(), buffer, bufferSize, &dwRead, nullptr);
-            if (!fSuccess)
+            HRESULT result = pipeReader.Read(strView);
+            if (FAILED(result))
             {
                 if (_closing.load())
                 {
                     // This is okay, break out to kill the thread
                     return 0;
                 }
-                else
-                {
-                    _disconnectHandlers();
-                    return (DWORD)-1;
-                }
+
+                _disconnectHandlers();
+                return (DWORD)-1;
             }
-            if (dwRead == 0)
+            else if (strView.empty()) // the reader succeeded but has no text to forward
             {
-                continue;
+                return 0;
             }
+
             // Convert buffer to hstring
-            char* pchStr = (char*)(buffer);
-            std::string str{ pchStr, dwRead };
-            auto hstr = winrt::to_hstring(str);
+            auto hstr{ winrt::to_hstring(strView) };
 
             // Pass the output to our registered event handlers
             _outputHandlers(hstr);
         }
+
+        return 0;
     }
 }
diff --git a/src/types/UTF8OutPipeReader.cpp b/src/types/UTF8OutPipeReader.cpp
@@ -0,0 +1,100 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "precomp.h"
+#include "inc/UTF8OutPipeReader.hpp"
+#include <algorithm>
+#include <type_traits>
+#include <utility>
+
+// Routine Description:
+// - Creates a reader for the given pipe. Only a reference to the handle wrapper is stored.
+// Arguments:
+// - outPipe - handle of the pipe that is passed to ReadFile by Read
+UTF8OutPipeReader::UTF8OutPipeReader(wil::unique_hfile& outPipe) :
+    _outPipe{ outPipe }
+{
+}
+
+// Routine Description:
+// - Reads the next chunk from the pipe and gives back a view of it that ends with a complete
+//   UTF-8 code point. Code units of an incomplete code point at the end of the chunk are cached
+//   and prepended to the data read by the next call.
+// Arguments:
+// - strView - receives a view into the internal buffer; the next call of Read overwrites that buffer
+// Return Value:
+// - S_OK with a non-empty view if at least one complete code point is available
+// - S_OK with an empty view if no data has been read and no cached code units were left over
+// - E_FAIL if ReadFile failed while the chunk (cached code units included) was not empty
+[[nodiscard]] HRESULT UTF8OutPipeReader::Read(_Out_ std::string_view& strView)
+{
+    DWORD dwRead{};
+    bool fSuccess{};
+
+    // in case of early escaping
+    *_buffer = 0;
+    strView = reinterpret_cast<char*>(_buffer);
+
+    // Loop until there is something to give back. A chunk that contains nothing but the beginning
+    // of a code point must not be reported as an empty view, because ConhostConnection::_OutputThread
+    // ends its thread as soon as it receives an empty view.
+    while (true)
+    {
+        // copy UTF-8 code units that were remaining from the previously read chunk (if any)
+        if (_dwPartialsLen != 0)
+        {
+            std::move(_utf8Partials, _utf8Partials + _dwPartialsLen, _buffer);
+        }
+
+        // try to read data
+        fSuccess = !!ReadFile(_outPipe.get(), &_buffer[_dwPartialsLen], std::extent<decltype(_buffer)>::value - _dwPartialsLen, &dwRead, nullptr);
+
+        dwRead += _dwPartialsLen;
+        _dwPartialsLen = 0;
+
+        if (dwRead == 0) // quit if no data has been read and no cached data was left over
+        {
+            return S_OK;
+        }
+        else if (!fSuccess) // reading failed
+        {
+            return E_FAIL;
+        }
+
+        const BYTE* const endPtr{ _buffer + dwRead };
+        const BYTE* backIter{ endPtr - 1 };
+        // If the last byte in the buffer was a byte belonging to a UTF-8 multi-byte character
+        if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
+        {
+            // Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the buffer
+            // A chunk shorter than 4 bytes has to be checked completely, hence the stop value of `dwRead + 1` in that case
+            for (DWORD dwSequenceLen{ 1UL }, stop{ dwRead < 4UL ? dwRead + 1UL : 4UL }; dwSequenceLen < stop; ++dwSequenceLen, --backIter)
+            {
+                // If Lead Byte found
+                if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
+                {
+                    // If the Lead Byte indicates that the last bytes in the buffer is a partial UTF-8 code point then cache them:
+                    //  Use the bitmask at index `dwSequenceLen`. Compare the result with the operand having the same index. If they
+                    //  are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
+                    //  sequence is a complete UTF-8 code point and the whole buffer is ready for the conversion to hstring.
+                    if ((*backIter & _cmpMasks[dwSequenceLen]) != _cmpOperands[dwSequenceLen])
+                    {
+                        std::move(backIter, endPtr, _utf8Partials);
+                        dwRead -= dwSequenceLen;
+                        _dwPartialsLen = dwSequenceLen;
+                    }
+
+                    break;
+                }
+            }
+        }
+
+        // give back a view of the part of the buffer that contains complete code points only
+        // do not leave the loop if the chunk has just contained partials, read the rest of the code point instead
+        if (dwRead != 0)
+        {
+            strView = std::string_view{ reinterpret_cast<char*>(_buffer), dwRead };
+            return S_OK;
+        }
+    }
+}
diff --git a/src/types/inc/UTF8OutPipeReader.hpp b/src/types/inc/UTF8OutPipeReader.hpp
@@ -0,0 +1,69 @@
+/*++
+Copyright (c) Microsoft Corporation
+Licensed under the MIT license.
+
+Module Name:
+- UTF8OutPipeReader.hpp
+
+Abstract:
+- This reads a UTF-8 stream and gives back a buffer that contains complete code points only
+- Partial UTF-8 code points at the end of the buffer read are cached and prepended to the next chunk read
+
+Author(s):
+- Steffen Illhardt (german-one) 12-July-2019
+--*/
+
+#pragma once
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+#include <windows.h>
+#include <wil\common.h>
+#include <wil\resource.h>
+#include <string_view>
+
+class UTF8OutPipeReader final // reads chunks of UTF-8 text from a pipe and gives back complete code points only
+{
+public:
+    UTF8OutPipeReader(wil::unique_hfile& outPipe); // stores a reference to outPipe; no copy of the handle is made
+    [[nodiscard]] HRESULT Read(_Out_ std::string_view& strView); // reads one chunk; strView receives a view of the part of _buffer that contains complete code points only
+
+private:
+    wil::unique_hfile& _outPipe; // reference to the handle of the pipe that Read passes to ReadFile
+
+    enum _Utf8BitMasks : BYTE
+    {
+        IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
+        MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte matches the IsAsciiByte pattern
+        IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
+        MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte matches the IsContinuationByte pattern
+        IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
+        MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteTwoByteSequence pattern
+        IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
+        MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteThreeByteSequence pattern
+        IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
+        MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteFourByteSequence pattern
+    };
+
+    // array of bitmasks, indexed by the number of code units at the end of the chunk (lead byte included)
+    constexpr const static BYTE _cmpMasks[]{
+        0, // unused
+        _Utf8BitMasks::MaskContinuationByte, // index 1: the lead byte is the last byte of the chunk
+        _Utf8BitMasks::MaskLeadByteTwoByteSequence, // index 2: the lead byte is followed by one byte
+        _Utf8BitMasks::MaskLeadByteThreeByteSequence, // index 3: the lead byte is followed by two bytes
+    };
+
+    // array of values for the comparisons, a masked lead byte that differs from the value at the same index begins a partial code point
+    constexpr const static BYTE _cmpOperands[]{
+        0, // unused
+        _Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte: a masked lead byte never equals it, thus a lead byte at the very end is always cached
+        _Utf8BitMasks::IsLeadByteTwoByteSequence, // two code units are complete only if the lead byte announces a two-byte sequence
+        _Utf8BitMasks::IsLeadByteThreeByteSequence, // three code units are complete only if the lead byte announces a three-byte sequence
+    };
+
+    BYTE _buffer[4096]{ 0 }; // buffer for the chunk read
+    BYTE _utf8Partials[4]{ 0 }; // buffer for code units of a partial UTF-8 code point that have to be cached
+    DWORD _dwPartialsLen{}; // number of cached UTF-8 code units
+};
diff --git a/src/types/lib/types.vcxproj b/src/types/lib/types.vcxproj
@@ -12,6 +12,7 @@
     <ClCompile Include="..\MenuEvent.cpp" />
     <ClCompile Include="..\ModifierKeyState.cpp" />
     <ClCompile Include="..\Utf16Parser.cpp" />
+    <ClCompile Include="..\UTF8OutPipeReader.cpp" />
     <ClCompile Include="..\Viewport.cpp" />
     <ClCompile Include="..\WindowBufferSizeEvent.cpp" />
     <ClCompile Include="..\precomp.cpp">
@@ -24,6 +25,7 @@
     <ClInclude Include="..\inc\convert.hpp" />
     <ClInclude Include="..\inc\GlyphWidth.hpp" />
     <ClInclude Include="..\inc\IInputEvent.hpp" />
+    <ClInclude Include="..\inc\UTF8OutPipeReader.hpp" />
     <ClInclude Include="..\inc\Viewport.hpp" />
     <ClInclude Include="..\inc\Utf16Parser.hpp" />
     <ClInclude Include="..\precomp.h" />
diff --git a/src/types/lib/types.vcxproj.filters b/src/types/lib/types.vcxproj.filters
@@ -57,6 +57,9 @@
     <ClCompile Include="..\utils.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\UTF8OutPipeReader.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\inc\IInputEvent.hpp">
@@ -83,6 +86,9 @@
     <ClInclude Include="..\utils.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <Natvis Include="$(SolutionDir)tools\ConsoleTypes.natvis" />
diff --git a/src/types/ut_types/Types.Unit.Tests.vcxproj b/src/types/ut_types/Types.Unit.Tests.vcxproj
@@ -2,6 +2,7 @@
 <Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <Import Project="$(SolutionDir)src\common.build.pre.props" />
   <ItemGroup>
+    <ClCompile Include="UTF8OutPipeReaderTests.cpp" />
     <ClCompile Include="UtilsTests.cpp" />
     <ClCompile Include="UuidTests.cpp" />
     <ClCompile Include="..\precomp.cpp">
diff --git a/src/types/ut_types/UTF8OutPipeReaderTests.cpp b/src/types/ut_types/UTF8OutPipeReaderTests.cpp

Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`
`16`	`16`	`#include <conpty-universal.h>`
`17`	`17`	`#include "../../types/inc/Utils.hpp"`
	`18`	`+#include "../../types/inc/UTF8OutPipeReader.hpp"`
`18`	`19`
`19`	`20`	`using namespace ::Microsoft::Console;`
`20`	`21`
`@@ -189,39 +190,36 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation`
`189`	`190`
`190`	`191`	`DWORD ConhostConnection::_OutputThread()`
`191`	`192`	`{`
`192`		`- const size_t bufferSize = 4096;`
`193`		`- BYTE buffer[bufferSize];`
`194`		`- DWORD dwRead;`
	`193`	`+ // Must not be static: the reader is bound to the pipe of this connection and caches its partial UTF-8 code units.`
	`194`	`+ UTF8OutPipeReader pipeReader{ _outPipe };`
	`195`	`+ std::string_view strView{};`
	`196`	`+ // process the data of the output pipe in a loop`
`195`	`197`	`while (true)`
`196`	`198`	`{`
`197`		`- dwRead = 0;`
`198`		`- bool fSuccess = false;`
`199`		`-`
`200`		`- fSuccess = !!ReadFile(_outPipe.get(), buffer, bufferSize, &dwRead, nullptr);`
`201`		`- if (!fSuccess)`
	`199`	`+ HRESULT result = pipeReader.Read(strView);`
	`200`	`+ if (FAILED(result))`
`202`	`201`	`{`
`203`	`202`	`if (_closing.load())`
`204`	`203`	`{`
`205`	`204`	`// This is okay, break out to kill the thread`
`206`	`205`	`return 0;`
`207`	`206`	`}`
`208`		`- else`
`209`		`- {`
`210`		`- _disconnectHandlers();`
`211`		`- return (DWORD)-1;`
`212`		`- }`
	`207`	`+`
	`208`	`+ _disconnectHandlers();`
	`209`	`+ return (DWORD)-1;`
`213`	`210`	`}`
`214`		`- if (dwRead == 0)`
	`211`	`+ else if (strView.empty()) // the reader succeeded but has no text to forward`
`215`	`212`	`{`
`216`		`- continue;`
	`213`	`+ return 0;`
`217`	`214`	`}`
	`215`	`+`
`218`	`216`	`// Convert buffer to hstring`
`219`		`- char* pchStr = (char*)(buffer);`
`220`		`- std::string str{ pchStr, dwRead };`
`221`		`- auto hstr = winrt::to_hstring(str);`
	`217`	`+ auto hstr{ winrt::to_hstring(strView) };`
`222`	`218`
`223`	`219`	`// Pass the output to our registered event handlers`
`224`	`220`	`_outputHandlers(hstr);`
`225`	`221`	`}`
	`222`	`+`
	`223`	`+ return 0;`
`226`	`224`	`}`
`227`	`225`	`}`