Skip to content

Commit 312d3c9

Browse files
german-oneDHowett
authored andcommitted
Fix for UTF-8 partials in function ConhostConnection::_OutputThread. (#1850)
* Fix for UTF-8 partials in functions `ConhostConnection::_OutputThread` and `ApiRoutines::WriteConsoleOutputCharacterAImpl` The implementation needs to check whether or not the buffer ends with a partial character. If so, only convert the code points which are complete, and save the partial code units in a cache that gets prepended to the next chunk of text. * Utf8OutPipeReader class added * Unit Test added * use specific macros and WIL classes * avoid possible deadlock caused by unclosed pipe handle (cherry picked from commit fa5b9b0)
1 parent 2cb9c9b commit 312d3c9

File tree

7 files changed

+325
-18
lines changed

7 files changed

+325
-18
lines changed

src/cascadia/TerminalConnection/ConhostConnection.cpp

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <conpty-universal.h>
1717
#include "../../types/inc/Utils.hpp"
18+
#include "../../types/inc/UTF8OutPipeReader.hpp"
1819

1920
using namespace ::Microsoft::Console;
2021

@@ -189,39 +190,36 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
189190

190191
DWORD ConhostConnection::_OutputThread()
191192
{
192-
const size_t bufferSize = 4096;
193-
BYTE buffer[bufferSize];
194-
DWORD dwRead;
193+
static UTF8OutPipeReader pipeReader{ _outPipe };
194+
std::string_view strView{};
195+
196+
// process the data of the output pipe in a loop
195197
while (true)
196198
{
197-
dwRead = 0;
198-
bool fSuccess = false;
199-
200-
fSuccess = !!ReadFile(_outPipe.get(), buffer, bufferSize, &dwRead, nullptr);
201-
if (!fSuccess)
199+
HRESULT result = pipeReader.Read(strView);
200+
if (FAILED(result))
202201
{
203202
if (_closing.load())
204203
{
205204
// This is okay, break out to kill the thread
206205
return 0;
207206
}
208-
else
209-
{
210-
_disconnectHandlers();
211-
return (DWORD)-1;
212-
}
207+
208+
_disconnectHandlers();
209+
return (DWORD)-1;
213210
}
214-
if (dwRead == 0)
211+
else if (strView.empty())
215212
{
216-
continue;
213+
return 0;
217214
}
215+
218216
// Convert buffer to hstring
219-
char* pchStr = (char*)(buffer);
220-
std::string str{ pchStr, dwRead };
221-
auto hstr = winrt::to_hstring(str);
217+
auto hstr{ winrt::to_hstring(strView) };
222218

223219
// Pass the output to our registered event handlers
224220
_outputHandlers(hstr);
225221
}
222+
223+
return 0;
226224
}
227225
}

src/types/UTF8OutPipeReader.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
#include "precomp.h"
5+
#include "inc/Utf8OutPipeReader.hpp"
6+
#include <type_traits>
7+
#include <utility>
8+
9+
UTF8OutPipeReader::UTF8OutPipeReader(wil::unique_hfile& outPipe) :
10+
_outPipe{ outPipe }
11+
{
12+
}
13+
14+
[[nodiscard]] HRESULT UTF8OutPipeReader::Read(_Out_ std::string_view& strView)
15+
{
16+
DWORD dwRead{};
17+
bool fSuccess{};
18+
19+
// in case of early escaping
20+
*_buffer = 0;
21+
strView = reinterpret_cast<char*>(_buffer);
22+
23+
// copy UTF-8 code units that were remaining from the previously read chunk (if any)
24+
if (_dwPartialsLen != 0)
25+
{
26+
std::move(_utf8Partials, _utf8Partials + _dwPartialsLen, _buffer);
27+
}
28+
29+
// try to read data
30+
fSuccess = !!ReadFile(_outPipe.get(), &_buffer[_dwPartialsLen], std::extent<decltype(_buffer)>::value - _dwPartialsLen, &dwRead, nullptr);
31+
32+
dwRead += _dwPartialsLen;
33+
_dwPartialsLen = 0;
34+
35+
if (dwRead == 0) // quit if no data has been read and no cached data was left over
36+
{
37+
return S_OK;
38+
}
39+
else if (!fSuccess) // reading failed
40+
{
41+
return E_FAIL;
42+
}
43+
44+
const BYTE* const endPtr{ _buffer + dwRead };
45+
const BYTE* backIter{ endPtr - 1 };
46+
// If the last byte in the buffer was a byte belonging to a UTF-8 multi-byte character
47+
if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
48+
{
49+
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the buffer
50+
for (DWORD dwSequenceLen{ 1UL }, stop{ dwRead < 4UL ? dwRead : 4UL }; dwSequenceLen < stop; ++dwSequenceLen, --backIter)
51+
{
52+
// If Lead Byte found
53+
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
54+
{
55+
// If the Lead Byte indicates that the last bytes in the buffer is a partial UTF-8 code point then cache them:
56+
// Use the bitmask at index `dwSequenceLen`. Compare the result with the operand having the same index. If they
57+
// are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
58+
// sequence is a complete UTF-8 code point and the whole buffer is ready for the conversion to hstring.
59+
if ((*backIter & _cmpMasks[dwSequenceLen]) != _cmpOperands[dwSequenceLen])
60+
{
61+
std::move(backIter, endPtr, _utf8Partials);
62+
dwRead -= dwSequenceLen;
63+
_dwPartialsLen = dwSequenceLen;
64+
}
65+
66+
break;
67+
}
68+
}
69+
}
70+
71+
// give back a view of the part of the buffer that contains complete code points only
72+
strView = std::string_view{ reinterpret_cast<char*>(_buffer), dwRead };
73+
return S_OK;
74+
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*++
2+
Copyright (c) Microsoft Corporation
3+
Licensed under the MIT license.
4+
5+
Module Name:
6+
- UTF8OutPipeReader.hpp
7+
8+
Abstract:
9+
- This reads a UTF-8 stream and gives back a buffer that contains complete code points only
10+
- Partial UTF-8 code points at the end of the buffer read are cached and prepended to the next chunk read
11+
12+
Author(s):
13+
- Steffen Illhardt (german-one) 12-July-2019
14+
--*/
15+
16+
#pragma once
17+
18+
#ifndef WIN32_LEAN_AND_MEAN
19+
#define WIN32_LEAN_AND_MEAN
20+
#endif
21+
22+
#include <windows.h>
23+
#include <wil\common.h>
24+
#include <wil\resource.h>
25+
#include <string_view>
26+
27+
class UTF8OutPipeReader final
28+
{
29+
public:
30+
UTF8OutPipeReader(wil::unique_hfile& outPipe);
31+
[[nodiscard]] HRESULT Read(_Out_ std::string_view& strView);
32+
33+
private:
34+
wil::unique_hfile& _outPipe;
35+
36+
enum _Utf8BitMasks : BYTE
37+
{
38+
IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
39+
MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsAsciiByte pattern
40+
IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
41+
MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsContinuationByte pattern
42+
IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
43+
MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteTwoByteSequence pattern
44+
IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
45+
MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteThreeByteSequence pattern
46+
IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
47+
MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteFourByteSequence pattern
48+
};
49+
50+
// array of bitmasks
51+
constexpr const static BYTE _cmpMasks[]{
52+
0, // unused
53+
_Utf8BitMasks::MaskContinuationByte,
54+
_Utf8BitMasks::MaskLeadByteTwoByteSequence,
55+
_Utf8BitMasks::MaskLeadByteThreeByteSequence,
56+
};
57+
58+
// array of values for the comparisons
59+
constexpr const static BYTE _cmpOperands[]{
60+
0, // unused
61+
_Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte
62+
_Utf8BitMasks::IsLeadByteTwoByteSequence,
63+
_Utf8BitMasks::IsLeadByteThreeByteSequence,
64+
};
65+
66+
BYTE _buffer[4096]{ 0 }; // buffer for the chunk read
67+
BYTE _utf8Partials[4]{ 0 }; // buffer for code units of a partial UTF-8 code point that have to be cached
68+
DWORD _dwPartialsLen{}; // number of cached UTF-8 code units
69+
};

src/types/lib/types.vcxproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
<ClCompile Include="..\MenuEvent.cpp" />
1313
<ClCompile Include="..\ModifierKeyState.cpp" />
1414
<ClCompile Include="..\Utf16Parser.cpp" />
15+
<ClCompile Include="..\UTF8OutPipeReader.cpp" />
1516
<ClCompile Include="..\Viewport.cpp" />
1617
<ClCompile Include="..\WindowBufferSizeEvent.cpp" />
1718
<ClCompile Include="..\precomp.cpp">
@@ -24,6 +25,7 @@
2425
<ClInclude Include="..\inc\convert.hpp" />
2526
<ClInclude Include="..\inc\GlyphWidth.hpp" />
2627
<ClInclude Include="..\inc\IInputEvent.hpp" />
28+
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp" />
2729
<ClInclude Include="..\inc\Viewport.hpp" />
2830
<ClInclude Include="..\inc\Utf16Parser.hpp" />
2931
<ClInclude Include="..\precomp.h" />

src/types/lib/types.vcxproj.filters

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@
5757
<ClCompile Include="..\utils.cpp">
5858
<Filter>Source Files</Filter>
5959
</ClCompile>
60+
<ClCompile Include="..\UTF8OutPipeReader.cpp">
61+
<Filter>Source Files</Filter>
62+
</ClCompile>
6063
</ItemGroup>
6164
<ItemGroup>
6265
<ClInclude Include="..\inc\IInputEvent.hpp">
@@ -83,6 +86,9 @@
8386
<ClInclude Include="..\utils.hpp">
8487
<Filter>Header Files</Filter>
8588
</ClInclude>
89+
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
90+
<Filter>Header Files</Filter>
91+
</ClInclude>
8692
</ItemGroup>
8793
<ItemGroup>
8894
<Natvis Include="$(SolutionDir)tools\ConsoleTypes.natvis" />

src/types/ut_types/Types.Unit.Tests.vcxproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
33
<Import Project="$(SolutionDir)src\common.build.pre.props" />
44
<ItemGroup>
5+
<ClCompile Include="UTF8OutPipeReaderTests.cpp" />
56
<ClCompile Include="UtilsTests.cpp" />
67
<ClCompile Include="UuidTests.cpp" />
78
<ClCompile Include="..\precomp.cpp">

0 commit comments

Comments
 (0)