-
Notifications
You must be signed in to change notification settings - Fork 9k
Fix for UTF-8 partials in function ConhostConnection::_OutputThread.
#1850
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
ea46579
7c83c38
7a4a814
a424a16
85b5509
5780770
d0e6e82
d98c589
7df6609
507f526
7d8c4b1
f08f31c
bb299d7
c395f3b
23c4286
99d0713
c877edd
e35a78f
31db29b
d09ac87
f23bcd9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5,6 +5,9 @@ | |||||||||||||||||||
| #include "ConhostConnection.h" | ||||||||||||||||||||
| #include "windows.h" | ||||||||||||||||||||
| #include <sstream> | ||||||||||||||||||||
| #include <string_view> | ||||||||||||||||||||
| #include <algorithm> | ||||||||||||||||||||
| #include <type_traits> | ||||||||||||||||||||
| // STARTF_USESTDHANDLES is only defined in WINAPI_PARTITION_DESKTOP | ||||||||||||||||||||
| // We're just gonna manually define it for this prototyping code | ||||||||||||||||||||
| #ifndef STARTF_USESTDHANDLES | ||||||||||||||||||||
|
|
@@ -190,38 +193,101 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation | |||||||||||||||||||
| DWORD ConhostConnection::_OutputThread() | ||||||||||||||||||||
| { | ||||||||||||||||||||
| const size_t bufferSize = 4096; | ||||||||||||||||||||
| BYTE buffer[bufferSize]; | ||||||||||||||||||||
| DWORD dwRead; | ||||||||||||||||||||
| BYTE buffer[bufferSize]{ 0 }; | ||||||||||||||||||||
| BYTE utf8Partials[4]{ 0 }; // buffer for code units of a partial UTF-8 code point that have to be cached | ||||||||||||||||||||
| DWORD dwRead{}; // length of a chunk read | ||||||||||||||||||||
| DWORD dwPartialsLen{}; // number of cached UTF-8 code units | ||||||||||||||||||||
| bool fSuccess{}; | ||||||||||||||||||||
| bool fStreamBegin{}; // indicates whether text is left in utf8Partials after the BOM check | ||||||||||||||||||||
| const BYTE bitmasks[]{ 0, 0b11000000, 0b11100000, 0b11110000 }; // for comparisons after the Lead Byte was found | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // check whether the UTF-8 stream has a Byte Order Mark | ||||||||||||||||||||
| fSuccess = !!ReadFile(_outPipe.get(), utf8Partials, 3UL, &dwPartialsLen, nullptr); | ||||||||||||||||||||
| if (!fSuccess) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| if (_closing.load()) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| // This is okay, break out to kill the thread | ||||||||||||||||||||
| return 0; | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| _disconnectHandlers(); | ||||||||||||||||||||
| return (DWORD)-1; | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| if (utf8Partials[0] == 0xEF && utf8Partials[1] == 0xBB && utf8Partials[2] == 0xBF) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| dwPartialsLen = 0; // discard the BOM | ||||||||||||||||||||
| } | ||||||||||||||||||||
| else | ||||||||||||||||||||
| { | ||||||||||||||||||||
| fStreamBegin = true; | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // process the data of the standard input in a loop | ||||||||||||||||||||
| while (true) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| dwRead = 0; | ||||||||||||||||||||
| bool fSuccess = false; | ||||||||||||||||||||
| // copy UTF-8 code units that were remaining from the previously read chunk (if any) | ||||||||||||||||||||
| if (dwPartialsLen != 0) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| std::move(utf8Partials, utf8Partials + dwPartialsLen, buffer); | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| fSuccess = !!ReadFile(_outPipe.get(), buffer, bufferSize, &dwRead, nullptr); | ||||||||||||||||||||
| if (!fSuccess) | ||||||||||||||||||||
| // try to read data | ||||||||||||||||||||
| fSuccess = !!ReadFile(_outPipe.get(), &buffer[dwPartialsLen], std::extent<decltype(buffer)>::value - dwPartialsLen, &dwRead, nullptr); | ||||||||||||||||||||
|
|
||||||||||||||||||||
| dwRead += dwPartialsLen; | ||||||||||||||||||||
| dwPartialsLen = 0; | ||||||||||||||||||||
|
|
||||||||||||||||||||
| if (dwRead == 0) // quit if no data has been read and no cached data was left over | ||||||||||||||||||||
| { | ||||||||||||||||||||
| return 0; | ||||||||||||||||||||
| } | ||||||||||||||||||||
| else if (!fSuccess && !fStreamBegin) // cached data without bytes for completion in the buffer | ||||||||||||||||||||
| { | ||||||||||||||||||||
| if (_closing.load()) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| // This is okay, break out to kill the thread | ||||||||||||||||||||
| return 0; | ||||||||||||||||||||
| } | ||||||||||||||||||||
| else | ||||||||||||||||||||
| { | ||||||||||||||||||||
| _disconnectHandlers(); | ||||||||||||||||||||
| return (DWORD)-1; | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| _disconnectHandlers(); | ||||||||||||||||||||
| return (DWORD)-1; | ||||||||||||||||||||
| } | ||||||||||||||||||||
| if (dwRead == 0) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| const BYTE* const endPtr{ buffer + dwRead }; | ||||||||||||||||||||
| const BYTE* backIter{ endPtr - 1 }; | ||||||||||||||||||||
| // If the last byte in the buffer was a byte belonging to a UTF-8 multi-byte character | ||||||||||||||||||||
| if ((*backIter & 0b10000000) == 0b10000000) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| continue; | ||||||||||||||||||||
| // Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the buffer | ||||||||||||||||||||
| for (DWORD dwSequenceLen{ 1UL }, stop{ dwRead < 4UL ? dwRead : 4UL }; dwSequenceLen < stop; ++dwSequenceLen, --backIter) | ||||||||||||||||||||
| { | ||||||||||||||||||||
| // If Lead Byte found | ||||||||||||||||||||
| if ((*backIter & 0b11000000) == 0b11000000) | ||||||||||||||||||||
|
||||||||||||||||||||
| const byte NonAsciiBytePrefix = 0x80; | |
| const byte ContinuationByteMask = 0xC0; | |
| const byte ContinuationBytePrefix = 0x80; | |
| const byte MostSignificantBitMask = 0x80; |
Different names for the same value. Doesn't include all values used here.
terminal/src/host/ut_host/Utf8ToWideCharParserTests.cpp
Lines 296 to 298 in 9b92986
| VERIFY_IS_TRUE(parser._IsLeadByte(0xC0)); // 2 byte sequence | |
| VERIFY_IS_TRUE(parser._IsLeadByte(0xE0)); // 3 byte sequence | |
| VERIFY_IS_TRUE(parser._IsLeadByte(0xF0)); // 4 byte sequence |
Better. But names are only in the comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm personally okay without the names 😄
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't agree more.
Yesterday I fiddled with an enum and member names like LeadByteTwoByteSequence. Then I used these names in the array and noticed that they are ambiguous. E.g. we use value 0b11100000 as bitmask in the AND operation and compare the result with 0b11000000. In the end each of these values would need two names. For example something like MaskLeadByteTwoByteSequence and IsLeadByteThreeByteSequence for value 0b11100000. Since we can't use two names for an array element, we would need a second array to disambiguate the element names. I don't like it because it's a mess and it doesn't improve the readability imo. When I tried it yesterday I actually confused myself...
In contrast, the binary literals are quite self-explanatory.
Uh oh!
There was an error while loading. Please reload this page.