Commit 138127a6 authored by vogelheim's avatar vogelheim Committed by Commit bot

Fix bad-char handling in utf-8 streaming streams. Also add test.

R=jochen@chromium.org
BUG=chromium:651333, v8:4947

Review-Url: https://codereview.chromium.org/2391273002
Cr-Commit-Position: refs/heads/master@{#40004}
parent 186e7db8
......@@ -286,6 +286,20 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_);
// If the current chunk is the last (empty) chunk we'll have to process
// any left-over, partial characters.
if (chunk.length == 0) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncrementalFinish(&current_.pos.incomplete_char);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
*cursor = static_cast<uc16>(t);
buffer_end_++;
current_.pos.chars++;
}
return;
}
static const unibrow::uchar kUtf8Bom = 0xfeff;
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
......@@ -421,7 +435,7 @@ size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
if (current_.chunk_no == chunks_.size()) {
out_of_data = !FetchChunk();
}
if (!out_of_data) FillBufferFromCurrentChunk();
FillBufferFromCurrentChunk();
}
DCHECK_EQ(current_.pos.chars - position, buffer_end_ - buffer_cursor_);
......
......@@ -333,25 +333,54 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
*buffer = 0;
return kBadChar;
}
} else {
} else if (*buffer <= 0xff) {
// We have one unprocessed byte left (from the last else case in this if
// statement).
uchar previous = *buffer;
*buffer = 0;
uchar t = ValueOfIncremental(previous, buffer);
if (t == kIncomplete) {
// If we have an incomplete character, process both the previous and the
// next byte at once.
return ValueOfIncremental(next, buffer);
} else {
// Otherwise, process the previous byte and save the next byte for next
// time.
DCHECK_EQ(0, *buffer);
*buffer = next;
return t;
}
} else if (IsContinuationCharacter(next)) {
// We're inside of a character, as described by buffer.
if (IsContinuationCharacter(next)) {
// How many bytes (excluding this one) do we still expect?
uint8_t count = (*buffer >> 24) - 1;
// Update the value.
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
if (count) {
*buffer = count << 24 | value;
return kIncomplete;
} else {
*buffer = 0;
return value;
}
// How many bytes (excluding this one) do we still expect?
uint8_t count = (*buffer >> 24) - 1;
// Update the value.
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
if (count) {
*buffer = count << 24 | value;
return kIncomplete;
} else {
// Within a character, but not a continuation character? Bad char.
*buffer = 0;
return kBadChar;
return value;
}
} else {
// Within a character, but not a continuation character? Then the
// previous char was a bad char. But we need to save the current
// one.
*buffer = next;
return kBadChar;
}
}
uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
DCHECK_NOT_NULL(buffer);
if (*buffer == 0) {
return kBufferEmpty;
} else {
// Process left-over chars. An incomplete char at the end maps to kBadChar.
uchar t = ValueOfIncremental(0, buffer);
return (t == kIncomplete) ? kBadChar : t;
}
}
......
......@@ -161,6 +161,7 @@ class Utf8 {
typedef uint32_t Utf8IncrementalBuffer;
static uchar ValueOfIncremental(byte next_byte,
Utf8IncrementalBuffer* buffer);
static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer);
// Excludes non-characters from the set of valid code points.
static inline bool IsValidCharacter(uchar c);
......
......@@ -423,3 +423,26 @@ TEST(CharacterStreams) {
TestCharacterStreams(buffer, arraysize(buffer) - 1);
TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
}
// Regression test for crbug.com/651333. Read invalid utf-8.
TEST(Regress651333) {
const uint8_t bytes[] =
"A\xf1"
"ad"; // Anad, with n == n-with-tilde.
const uint16_t unicode[] = {65, 65533, 97, 100};
// Run the test for all sub-strings 0..N of bytes, to make sure we hit the
// error condition in and at chunk boundaries.
for (size_t len = 0; len < arraysize(bytes); len++) {
// Read len bytes from bytes, and compare against the expected unicode
// characters. Expect kBadChar ( == Unicode replacement char == code point
// 65533) instead of the incorrectly coded Latin1 char.
ChunkSource chunks(bytes, len, false);
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
&chunks, v8::ScriptCompiler::StreamedSource::UTF8));
for (size_t i = 0; i < len; i++) {
CHECK_EQ(unicode[i], stream->Advance());
}
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment