Commit 138127a6 authored by vogelheim's avatar vogelheim Committed by Commit bot

Fix bad-char handling in utf-8 streaming streams. Also add test.

R=jochen@chromium.org
BUG=chromium:651333, v8:4947

Review-Url: https://codereview.chromium.org/2391273002
Cr-Commit-Position: refs/heads/master@{#40004}
parent 186e7db8
...@@ -286,6 +286,20 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { ...@@ -286,6 +286,20 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_); uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_); DCHECK_EQ(cursor, buffer_end_);
// If the current chunk is the last (empty) chunk we'll have to process
// any left-over, partial characters.
if (chunk.length == 0) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncrementalFinish(&current_.pos.incomplete_char);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
*cursor = static_cast<uc16>(t);
buffer_end_++;
current_.pos.chars++;
}
return;
}
static const unibrow::uchar kUtf8Bom = 0xfeff; static const unibrow::uchar kUtf8Bom = 0xfeff;
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char = unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
...@@ -421,7 +435,7 @@ size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) { ...@@ -421,7 +435,7 @@ size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
if (current_.chunk_no == chunks_.size()) { if (current_.chunk_no == chunks_.size()) {
out_of_data = !FetchChunk(); out_of_data = !FetchChunk();
} }
if (!out_of_data) FillBufferFromCurrentChunk(); FillBufferFromCurrentChunk();
} }
DCHECK_EQ(current_.pos.chars - position, buffer_end_ - buffer_cursor_); DCHECK_EQ(current_.pos.chars - position, buffer_end_ - buffer_cursor_);
......
...@@ -333,9 +333,26 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { ...@@ -333,9 +333,26 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
*buffer = 0; *buffer = 0;
return kBadChar; return kBadChar;
} }
} else if (*buffer <= 0xff) {
// We have one unprocessed byte left (from the last else case in this if
// statement).
uchar previous = *buffer;
*buffer = 0;
uchar t = ValueOfIncremental(previous, buffer);
if (t == kIncomplete) {
// If we have an incomplete character, process both the previous and the
// next byte at once.
return ValueOfIncremental(next, buffer);
} else { } else {
// Otherwise, process the previous byte and save the next byte for next
// time.
DCHECK_EQ(0, *buffer);
*buffer = next;
return t;
}
} else if (IsContinuationCharacter(next)) {
// We're inside of a character, as described by buffer. // We're inside of a character, as described by buffer.
if (IsContinuationCharacter(next)) {
// How many bytes (excluding this one) do we still expect? // How many bytes (excluding this one) do we still expect?
uint8_t count = (*buffer >> 24) - 1; uint8_t count = (*buffer >> 24) - 1;
// Update the value. // Update the value.
...@@ -348,10 +365,22 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { ...@@ -348,10 +365,22 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
return value; return value;
} }
} else { } else {
// Within a character, but not a continuation character? Bad char. // Within a character, but not a continuation character? Then the
*buffer = 0; // previous char was a bad char. But we need to save the current
// one.
*buffer = next;
return kBadChar; return kBadChar;
} }
}
uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
DCHECK_NOT_NULL(buffer);
if (*buffer == 0) {
return kBufferEmpty;
} else {
// Process left-over chars. An incomplete char at the end maps to kBadChar.
uchar t = ValueOfIncremental(0, buffer);
return (t == kIncomplete) ? kBadChar : t;
} }
} }
......
...@@ -161,6 +161,7 @@ class Utf8 { ...@@ -161,6 +161,7 @@ class Utf8 {
typedef uint32_t Utf8IncrementalBuffer; typedef uint32_t Utf8IncrementalBuffer;
static uchar ValueOfIncremental(byte next_byte, static uchar ValueOfIncremental(byte next_byte,
Utf8IncrementalBuffer* buffer); Utf8IncrementalBuffer* buffer);
static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer);
// Excludes non-characters from the set of valid code points. // Excludes non-characters from the set of valid code points.
static inline bool IsValidCharacter(uchar c); static inline bool IsValidCharacter(uchar c);
......
...@@ -423,3 +423,26 @@ TEST(CharacterStreams) { ...@@ -423,3 +423,26 @@ TEST(CharacterStreams) {
TestCharacterStreams(buffer, arraysize(buffer) - 1); TestCharacterStreams(buffer, arraysize(buffer) - 1);
TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298); TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
} }
// Regression test for crbug.com/651333. Read invalid utf-8.
TEST(Regress651333) {
const uint8_t bytes[] =
"A\xf1"
"ad"; // Anad, with n == n-with-tilde.
const uint16_t unicode[] = {65, 65533, 97, 100};
// Run the test for all sub-strings 0..N of bytes, to make sure we hit the
// error condition in and at chunk boundaries.
for (size_t len = 0; len < arraysize(bytes); len++) {
// Read len bytes from bytes, and compare against the expected unicode
// characters. Expect kBadChar ( == Unicode replacement char == code point
// 65533) instead of the incorrectly coded Latin1 char.
ChunkSource chunks(bytes, len, false);
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
&chunks, v8::ScriptCompiler::StreamedSource::UTF8));
for (size_t i = 0; i < len; i++) {
CHECK_EQ(unicode[i], stream->Advance());
}
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment