Commit a2b8b6e7 authored by vogelheim's avatar vogelheim Committed by Commit bot

Handle a UTF-8 BOM at the beginning of a UTF-8 stream.

(This should make it possible to drop the BOM handling in the Blink bindings.)

R=marja@chromium.org
BUG=v8:4947

Review-Url: https://codereview.chromium.org/2354973002
Cr-Commit-Position: refs/heads/master@{#39579}
parent a7455beb
...@@ -286,6 +286,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { ...@@ -286,6 +286,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_); uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_); DCHECK_EQ(cursor, buffer_end_);
static const unibrow::uchar kUtf8Bom = 0xfeff;
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char = unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
current_.pos.incomplete_char; current_.pos.incomplete_char;
size_t it; size_t it;
...@@ -294,7 +296,11 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { ...@@ -294,7 +296,11 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
unibrow::uchar t = unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char); unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
if (t == unibrow::Utf8::kIncomplete) continue; if (t == unibrow::Utf8::kIncomplete) continue;
if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { if (V8_LIKELY(t < kUtf8Bom)) {
*(cursor++) = static_cast<uc16>(t); // The by most frequent case.
} else if (t == kUtf8Bom && current_.pos.bytes + it == 2) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t); *(cursor++) = static_cast<uc16>(t);
} else { } else {
*(cursor++) = unibrow::Utf16::LeadSurrogate(t); *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
......
...@@ -109,6 +109,67 @@ TEST(Utf8StreamAsciiOnly) { ...@@ -109,6 +109,67 @@ TEST(Utf8StreamAsciiOnly) {
} while (c != v8::internal::Utf16CharacterStream::kEndOfInput); } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
} }
// A UTF-8 stream that starts with a byte order mark must yield exactly the
// characters of the BOM-less reference string, and seeking must address those
// same characters.
TEST(Utf8StreamBOM) {
  using v8::internal::Utf16CharacterStream;

  // Input: the three BOM bytes (EF BB BF) followed by the UTF-8 test string.
  char bom_prefixed[3 + arraysize(unicode_utf8)] = "\xef\xbb\xbf";
  strncpy(bom_prefixed + 3, unicode_utf8, arraysize(unicode_utf8));

  const char* chunk_list[] = {bom_prefixed, "\0"};
  ChunkSource source(chunk_list);
  std::unique_ptr<Utf16CharacterStream> utf16_stream(
      v8::internal::ScannerStream::For(
          &source, v8::ScriptCompiler::StreamedSource::UTF8));

  // Every character read must match the reference data; the BOM itself must
  // never show up in the output.
  size_t index = 0;
  while (unicode_ucs2[index]) {
    CHECK_EQ(unicode_ucs2[index], utf16_stream->Advance());
    ++index;
  }
  CHECK_EQ(Utf16CharacterStream::kEndOfInput, utf16_stream->Advance());

  // Seeking has to land on the expected reference characters as well.
  utf16_stream->Seek(0);
  CHECK_EQ(unicode_ucs2[0], utf16_stream->Advance());
  utf16_stream->Seek(5);
  CHECK_EQ(unicode_ucs2[5], utf16_stream->Advance());
}
// The byte order mark must also be recognised when its three bytes are
// spread over several chunks rather than arriving together.
TEST(Utf8SplitBOM) {
  using v8::internal::Utf16CharacterStream;

  // Building blocks: the first two BOM bytes together, each of them alone,
  // and the final BOM byte immediately followed by the UTF-8 test string.
  char bom_head[] = "\xef\xbb";
  char first_bom_byte[] = "\xef";
  char second_bom_byte[] = "\xbb";
  char bom_tail_and_text[1 + arraysize(unicode_utf8)] = "\xbf";
  strncpy(bom_tail_and_text + 1, unicode_utf8, arraysize(unicode_utf8));

  // Case 1: BOM split 2 + 1 across two chunks.
  {
    const char* chunk_list[] = {bom_head, bom_tail_and_text, "\0"};
    ChunkSource source(chunk_list);
    std::unique_ptr<Utf16CharacterStream> utf16_stream(
        v8::internal::ScannerStream::For(
            &source, v8::ScriptCompiler::StreamedSource::UTF8));
    // The output must equal the reference data, with no BOM in front.
    size_t index = 0;
    while (unicode_ucs2[index]) {
      CHECK_EQ(unicode_ucs2[index], utf16_stream->Advance());
      ++index;
    }
  }

  // Case 2: BOM split 1 + 1 + 1, i.e. every BOM byte in its own chunk.
  {
    const char* chunk_list[] = {first_bom_byte, second_bom_byte,
                                bom_tail_and_text, "\0"};
    ChunkSource source(chunk_list);
    std::unique_ptr<Utf16CharacterStream> utf16_stream(
        v8::internal::ScannerStream::For(
            &source, v8::ScriptCompiler::StreamedSource::UTF8));
    // The output must equal the reference data, with no BOM in front.
    size_t index = 0;
    while (unicode_ucs2[index]) {
      CHECK_EQ(unicode_ucs2[index], utf16_stream->Advance());
      ++index;
    }
  }
}
TEST(Utf8ChunkBoundaries) { TEST(Utf8ChunkBoundaries) {
// Test utf-8 parsing at chunk boundaries. // Test utf-8 parsing at chunk boundaries.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment